library(tidyverse)
library(phyloseq)
# library(dada2)
# library(Biostrings)
# library(DECIPHER)
library(phangorn)
library(readr)
library(seqinr)
# library(decontam)
library(ape)
library(vegan)
#library(philr)
library(RColorBrewer)
# library(microbiome)
# library(DESeq2)
library(compositions)
# library(cowplot)
# library(plotly)
# library(htmlwidgets)
# library(withr)
# library(devtools)
library(SpiecEasi)
library(otuSummary)
library(psych)
library(Matrix)
library(igraph)
# Helper functions from J. Cram https://biovcnet.github.io/_pages/NetworkScience_SparCC.nb
pass <- function(x){x}
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
reorder_cor_and_p <- function(cormat, pmat){
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
pmat <- pmat[hc$order, hc$order]
list(r = cormat, p = pmat)
}
#Custom colorblind pallette, see: https://stackoverflow.com/questions/57153428/r-plot-color-combinations-that-are-colorblind-accessible
customvermillion<-rgb(213/255,94/255,0/255)
custombluegreen<-rgb(0/255,158/255,115/255)
customblue<-rgb(0/255,114/255,178/255)
customskyblue<-rgb(86/255,180/255,233/255)
customreddishpurple<-rgb(204/255,121/255,167/255)
Metadata:
metadata <- read_csv("Metadata.csv")
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
`Sample Name` = [31mcol_character()[39m,
Replicate = [31mcol_character()[39m,
Type = [31mcol_character()[39m,
SizeFraction = [31mcol_character()[39m,
Season = [31mcol_character()[39m,
OxCond = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
Import SRA table and match SRA IDs with sample IDs in metadata file
SRARunTable <- read_csv("sra_data/SraRunTable.txt")
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_character(),
AvgSpotLen = [32mcol_double()[39m,
Bases = [32mcol_double()[39m,
Bytes = [32mcol_double()[39m,
ReleaseDate = [34mcol_datetime(format = "")[39m,
Depth_m = [32mcol_double()[39m,
CH4_uM = [32mcol_double()[39m,
H2S_Um = [32mcol_double()[39m,
Oxygen_uM = [32mcol_double()[39m,
Particulate_Sulfur_uM = [32mcol_double()[39m,
salinity = [32mcol_double()[39m,
Temperature_degree_C = [32mcol_double()[39m,
TZVS_uM = [32mcol_double()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
metadata <- left_join(metadata, SRARunTable, by = 'Sample Name')
DADA2 results:
# Import Count table. Skip first row of tsv file, which is just some text
count_table <- read_tsv(file="dada2_export/ASVs_counts.tsv")
Missing column names filled in: 'X1' [1]
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
X1 = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
# And specify that the first column of data are rownames
count_table <- column_to_rownames(count_table, var = colnames(count_table)[1])
# Import taxonomy of ASVs
taxonomy <- read_tsv(file="dada2_export/ASVs_taxonomy.tsv")
Missing column names filled in: 'X1' [1]
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
X1 = [31mcol_character()[39m,
Kingdom = [31mcol_character()[39m,
Supergroup = [31mcol_character()[39m,
Division = [31mcol_character()[39m,
Class = [31mcol_character()[39m,
Order = [31mcol_character()[39m,
Family = [31mcol_character()[39m,
Genus = [31mcol_character()[39m,
Species = [31mcol_character()[39m
)
# And specify that the first column of data are rownames
taxonomy <- column_to_rownames(taxonomy, var = colnames(taxonomy)[1])
# Use rarecurve, from the Vegan package. Rarcurve expects the dataset as a dataframe so we need to use as.data.frame again:
count_table_df <- as.data.frame(count_table)
# Plot the rarefaction curves, color-coding by the colors listed in sample_info_tab, which indicate sample type, and transforming using t() again
# Running this 5-10 samples at a time because otherwise it takes a long time to render
rarecurve(t(count_table_df), step=100, cex=0.5, ylab="ASVs", label=T)
count_table_no_singletons <- filter(count_table,rowSums(count_table)>1)
# retains all ASVs (out of 14176)
and change sample names from NCBI ID to our internal sample IDs
# Modify taxa names in count_table_no_singletons, which are the NCBI SRA numbers. Want to use our internal sample key
key <- SRARunTable %>% select(Run, 'Sample Name')
x <- (t(count_table_no_singletons))
x <- as.data.frame(cbind(x, Run = rownames(x)))
y <- t(left_join(x, key, by = "Run"))
colnames(y) <- y['Sample Name',]
y <- y[ !(rownames(y) %in% c('Sample Name', 'Run')), ]
count_table_2 <- type_convert(as.data.frame(y))
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double()
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
This process takes a LONG time so run once and save .RData object In the Dada2 tools, there are no options to build a tree (unlike in Qiime2) but we can build it here using DECIPHER and phangorn
(Based on https://f1000research.com/articles/5-1492/v2)
Make an alignment using tools from Decipher (Note- alignment step takes several hours. Commented out for now. Only need to run once)
## import fasta
# fas <- "dada2_export/ASVs.fa"
# seqs <- readDNAStringSet(fas)
# seqs
#
# # perform the alignment
# aligned <- AlignSeqs(seqs) # automatically detects and uses all cores
#
# # view the alignment in a browser (optional)
# BrowseSeqs(aligned, highlight=0)
#
# # write out aligned sequence file
# writeXStringSet(aligned, file="ASVs.aligned.fasta")
Use phangorn package to build tree. Here we are building a maximum likelihood neighbor-joining tree. (Also takes a while to run. Comment out for now.)
# phang.align <- phyDat(as(aligned, "matrix"), type="DNA") # convert to phyDat format
# dm <- dist.ml(phang.align) # calculate pairwise distance matrix
# treeNJ <- NJ(dm) # perform neighbor-joining tree method
# fit = pml(treeNJ, data=phang.align) # compute intermal max likelihood
Since the step above takes a long time, save all variables up to this point in environment as RData object
save.image("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_tree.RData")
Re-load
load("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_tree.RData")
Here we will do ordinations using the phyloseq package, which first requires making phyloseq objects out of each of our input data tables (in the last tutorial, I imported the tree using phyloseq so it is already a phyloseq object)
ASV = otu_table(count_table_2, taxa_are_rows = TRUE)
There were 15 warnings (use warnings() to see them)
TAX = tax_table(as.matrix(taxonomy))
META = sample_data(data.frame(metadata, row.names = metadata$`Sample Name`))
TREE = phy_tree(fit$tree)
First check that the inputs are in compatible formats by checking for ASV names with the phyloseq function, taxa_names
head(taxa_names(TAX))
[1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
head(taxa_names(ASV))
[1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
head(taxa_names(TREE))
[1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
And check sample names were also detected
head(sample_names(ASV))
[1] "AE3a103A" "AE3b103A" "AE1b900AM" "AE3a103B" "AE3b103B" "AE3a198B"
head(sample_names(META))
[1] "AE3a103A" "AE3b103A" "AE3a198A" "AE3b198A" "AE3a234A" "AE3b234A"
And make the phyloseq object
ps <- phyloseq(ASV, TAX, META , TREE)
Check some features of the phyloseq object
rank_names(ps)
[1] "Kingdom" "Supergroup" "Division" "Class" "Order" "Family" "Genus"
[8] "Species"
table(tax_table(ps)[, "Supergroup"], exclude = NULL)
Alveolata Amoebozoa Apusozoa Archaeplastida Excavata Hacrobia
8880 9 45 108 9 395
Opisthokonta Rhizaria Stramenopiles <NA>
768 2405 1086 471
unique(tax_table(ps)[, "Supergroup"])
Taxonomy Table: [10 taxa by 1 taxonomic ranks]:
Supergroup
ASV_1 "Alveolata"
ASV_2 "Rhizaria"
ASV_6 "Stramenopiles"
ASV_18 "Opisthokonta"
ASV_78 "Hacrobia"
ASV_148 "Archaeplastida"
ASV_193 NA
ASV_557 "Apusozoa"
ASV_1114 "Amoebozoa"
ASV_2665 "Excavata"
Filter out those ambigious Supergroup annotations- losing 471 ASVs
ps <- subset_taxa(ps, !is.na(Supergroup) & !Supergroup %in% c("", "NA"))
table(tax_table(ps)[, "Supergroup"], exclude = NULL)
Alveolata Amoebozoa Apusozoa Archaeplastida Excavata Hacrobia
8880 9 45 108 9 395
Opisthokonta Rhizaria Stramenopiles
768 2405 1086
Check out the Division names
table(tax_table(ps)[, "Division"], exclude = NULL)
Apicomplexa Apusomonadidae Centroheliozoa Cercozoa
29 26 40 246
Chlorophyta Choanoflagellida Ciliophora Cryptophyta
64 54 407 50
Dinoflagellata Discoba Foraminifera Fungi
8330 1 2 57
Haptophyta Hilomonadea Katablepharidophyta Lobosa
215 17 2 9
Mesomycetozoa Metamonada Metazoa Ochrophyta
17 8 561 453
Opalozoa Opisthokonta_X Perkinsea Picozoa
216 14 5 61
Pseudofungi Radiolaria Rhodophyta Sagenista
72 2155 4 186
Stramenopiles_X Streptophyta Telonemia <NA>
61 38 27 278
Filter out any with “NA” as Division
ps <- subset_taxa(ps, !is.na(Division) & !Division %in% c(""))
table(tax_table(ps)[, "Division"], exclude = NULL)
Apicomplexa Apusomonadidae Centroheliozoa Cercozoa
29 26 40 246
Chlorophyta Choanoflagellida Ciliophora Cryptophyta
64 54 407 50
Dinoflagellata Discoba Foraminifera Fungi
8330 1 2 57
Haptophyta Hilomonadea Katablepharidophyta Lobosa
215 17 2 9
Mesomycetozoa Metamonada Metazoa Ochrophyta
17 8 561 453
Opalozoa Opisthokonta_X Perkinsea Picozoa
216 14 5 61
Pseudofungi Radiolaria Rhodophyta Sagenista
72 2155 4 186
Stramenopiles_X Streptophyta Telonemia
61 38 27
After the above, 13,427 ASVs remain from the original 14,177
Eliminate the libraries that didn’t have many sequences, AE3a198A, AE3b314A, AE2a200A, AE2b900AN, AE2a200B, AE2a267B, AE2a900BN
taxa_to_keep <- !sample_names(ps) %in% c("AE3a198A","AE3b314A","AE2a200A","AE2b900AN","AE2a200B","AE2a267B","AE2a900BN")
ps <- prune_samples(taxa_to_keep, ps)
41 samples remain and stil 13,427 ASVs
Check rarefaction curve again to make sure those low-sqeuencing-effort samples have been removed
rarecurve(t(otu_table(ps)), step=100, cex=0.5, ylab="ASVs", label=T)
Have to do this because you may have removed the root of your tree when pruning). (I found this handy function from here which picks the longest branch to root from).
There were 15 warnings (use warnings() to see them)
# first define function from link above to find furthest outgroup
pick_new_outgroup <- function(tree.unrooted){
require("magrittr")
require("data.table")
require("ape") # ape::Ntip
# tablify parts of tree that we need.
treeDT <-
cbind(
data.table(tree.unrooted$edge),
data.table(length = tree.unrooted$edge.length)
)[1:Ntip(tree.unrooted)] %>%
cbind(data.table(id = tree.unrooted$tip.label))
# Take the longest terminal branch as outgroup
new.outgroup <- treeDT[which.max(length)]$id
return(new.outgroup) }
# then run on my phyloseq tree
my.tree <- phy_tree(ps)
out.group <- pick_new_outgroup(my.tree)
out.group
[1] "ASV_10740"
# Then use this outgroup to root the tree
new.tree1 <- ape::root(my.tree, outgroup=out.group, resolve.root=TRUE)
phy_tree(ps) <- new.tree1
# Check if tree is binary (dichotomous not multichotomous)
is.binary.tree(phy_tree(ps))
[1] TRUE
# If false, would have to run
# new.tree2 <- ape::multi2di(new.tree1)
# phy_tree(ps) <- new.tree2
# phy_tree(ps)
Check overall how the phyla are distributed among samples. Phyloseq makes this easy
# First aglomerate the ASVs at the phylum level using the phyloseq function, tax_glom
DivisionGlommed = tax_glom(ps, "Division")
# There are many phyla here, so have to make a custom color palette by interpolating from an existing one in RColorBrewer
colourCount = length(table(tax_table(ps)[, "Division"], exclude = NULL))
getPalette = colorRampPalette(brewer.pal(11, "Spectral"))
DivisionPalette = getPalette(colourCount)
# and plot
plot_bar(DivisionGlommed, x = "Sample", fill = "Division") +
scale_fill_manual(values = DivisionPalette)
Plot compositional (relative abundances) instead of absolute abundance using microbiome::transform
ps_ra <- microbiome::transform(ps, transform = "compositional")
(otu_table(ps_ra))[1:5,1:5]
OTU Table: [5 taxa and 5 samples]
taxa are rows
AE3a103A AE3b103A AE1b900AM AE3a103B AE3b103B
ASV_1 4.046390e-04 0.000105531 2.462054e-05 0.000000e+00 2.400346e-05
ASV_2 0.000000e+00 0.000000000 3.132963e-02 0.000000e+00 5.600807e-05
ASV_3 6.674871e-03 0.014117702 2.265089e-02 3.696079e-03 1.055352e-02
ASV_4 1.244014e-03 0.001524337 1.231027e-05 4.769134e-05 6.720968e-04
ASV_5 2.675299e-05 0.000000000 0.000000e+00 7.948557e-06 1.040150e-04
# Then aglomerate the ASVs at the phylum level using the phyloseq function, tax_glom
DivisionGlommed_RA = tax_glom(ps_ra, "Division")
# and plot
Division_barplot <- plot_bar(DivisionGlommed_RA, x = "Sample", fill = "Division") +
scale_fill_manual(values = DivisionPalette) +
theme(legend.text = element_text(size = 10))
Division_barplot
# export
ggsave("Figures/Division_barplot.eps",Division_barplot, width = 15, height = 5, units = c("in"))
Lots of dinoflagellates and radiolaria. Makes sense. But the above is the distribution from all samples. Next make plots that indicate distributions across environmental gradients. Calculate averages and use bubble plots
Get average relative abundances from sample replicates
otu_table_mean_ra <-
mutate(data.frame(otu_table(ps_ra)), "103A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a103A","AE3b103A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "198A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3b198A")), na.rm = TRUE)) %>% # Sample AE3a198A was removed
mutate(data.frame(otu_table(ps_ra)), "234A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a234A","AE3b234A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "295A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a295A","AE3b295A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "314A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a314A")), na.rm = TRUE)) %>% # Sample AE3b314A was removed
mutate(data.frame(otu_table(ps_ra)), "900AM" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a900AM","AE1b900AM")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "103B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a103B","AE3b103B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "198B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a198B","AE3b198B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "234B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a234B","AE3b234B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "295B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a295B","AE3b295B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "314B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a314B","AE3b314B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "900BM" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a900BM","AE1b900BM")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "143A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a143A","AE2b143A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "200A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b200A")), na.rm = TRUE)) %>% # AE2a200A was removed
mutate(data.frame(otu_table(ps_ra)), "237A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a237A","AE2b237A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "247A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a247A","AE2b247A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "267A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a267A","AE2b267A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "900AN" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a900AN")), na.rm = TRUE)) %>% # AE2b900AN was removed
mutate(data.frame(otu_table(ps_ra)), "143B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a143B","AE2b143B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "200B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b200B")), na.rm = TRUE)) %>% # AE2a200B was removed
mutate(data.frame(otu_table(ps_ra)), "237B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a237B","AE2b237B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "247B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a247B","AE2b247B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "267B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b267B")), na.rm = TRUE)) %>% # AE2a267B was removed
mutate(data.frame(otu_table(ps_ra)), "900BN" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b900BN")), na.rm = TRUE)) # AE2a900BN was removed
otu_table_mean_ra <- otu_table_mean_ra[,unique(metadata$Replicate)]
otu_table_mean_ra
Make into new phyloseq object
metadata2 <- unique(select(metadata,!c('Sample Name',Type,colnames(SRARunTable))))
META2 <- sample_data(data.frame(metadata2, row.names = metadata2$Replicate))
ps_ra_mean <- phyloseq(otu_table(otu_table_mean_ra, taxa_are_rows = TRUE), TAX, TREE, META2)
# First aglomerate the ASVs at the phylum level using the phyloseq function, tax_glom
ps_ra_mean_division <- tax_glom(ps_ra_mean, "Division")
# and check by bar plotting
plot_bar(ps_ra_mean_division, x = "Sample", fill = "Division") +
scale_fill_manual(values = DivisionPalette)
Extract mean relative abundance, glommed by division, from the phyloseq object and pair it to taxonomic data
division_df <- data.frame(otu_table(ps_ra_mean_division))
colnames(division_df) <- colnames(otu_table(ps_ra_mean_division))
division_df$ASV <- rownames(division_df)
otu_table_mean_ra <- left_join(division_df, as_tibble(taxonomy, rownames = "ASV"), by = "ASV")
otu_table_mean_ra
Pivot longer
otu_table_mean_ra <- pivot_longer(otu_table_mean_ra, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Mean_RA")
otu_table_mean_ra
Join metadata
otu_table_mean_ra <- left_join(otu_table_mean_ra, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
# Replace zeroes in RA with NA (better for plotting)
otu_table_mean_ra$Mean_RA[otu_table_mean_ra$Mean_RA == 0] <- NA
otu_table_mean_ra
# reorder some factors to make them plot in the order I want
otu_table_mean_ra$OxCond <- factor(otu_table_mean_ra$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
otu_table_mean_ra$SizeFraction <- factor(otu_table_mean_ra$SizeFraction, levels = c("PA", "FL"))
euk_divisions_bubbleplot_color <- ggplot(otu_table_mean_ra,aes (x = as.character(Depth), y = reorder(Division, Mean_RA, function(x){sum(x,na.rm = TRUE)}), color = OxCond)) +
geom_point(aes(size =Mean_RA))+
facet_wrap(Season~SizeFraction, scales = "free_x", drop= TRUE, ncol = 4) +
scale_size(range = c(1,15)) +
scale_size_area(breaks = c(0,.25,.5,.75,1), max_size = 6) +
xlab("Depth") +
ylab("") +
labs(size="Relative Abundance", color = "Redox Condition") +
scale_color_manual(values = c("blue", "red", "brown4")) +
theme_bw() +
theme(axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
axis.title.x= element_text(size=12),
axis.title.y= element_text(size=12))
Scale for 'size' is already present. Adding another scale for 'size', which will replace
the existing scale.
euk_divisions_bubbleplot_color
Save figure
ggsave(filename = "Figures/euk_divisions_bubbleplot_color.eps", plot = euk_divisions_bubbleplot_color, units = c("in"), width = 10, height = 6, dpi = 300)
save.image("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_bubbleplots.RData")
Re-load
load("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_bubbleplots.RData")
Import
arch_counts <- read_csv("Suter_2018_count_tables/Cariaco_AA_updated_raw.csv");
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
`#OTU ID` = [31mcol_character()[39m,
`taxonomy-1` = [31mcol_character()[39m,
`taxonomy-2` = [31mcol_character()[39m,
`taxonomy-3` = [31mcol_character()[39m,
`taxonomy-4` = [31mcol_character()[39m,
`taxonomy-5` = [31mcol_character()[39m,
`taxonomy-6` = [31mcol_character()[39m,
`taxonomy-7` = [31mcol_character()[39m,
`taxonomy-8` = [33mcol_logical()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
1 parsing failure.
row col expected actual file
37552 taxonomy-8 1/0/T/F/TRUE/FALSE Methanobacterium_sp._GH 'Suter_2018_count_tables/Cariaco_AA_updated_raw.csv'
bac_counts <- read_csv("Suter_2018_count_tables/Cariaco_AB_updated_raw.csv");
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
`#OTU ID` = [31mcol_character()[39m,
`Refined taxonomy` = [31mcol_character()[39m,
`Interesting close relatives` = [31mcol_character()[39m,
`taxonomy-1` = [31mcol_character()[39m,
`taxonomy-2` = [31mcol_character()[39m,
`taxonomy-3` = [31mcol_character()[39m,
`taxonomy-4` = [31mcol_character()[39m,
`taxonomy-5` = [31mcol_character()[39m,
`taxonomy-6` = [31mcol_character()[39m,
`taxonomy-7` = [31mcol_character()[39m,
`taxonomy-8` = [31mcol_character()[39m,
`taxonomy-9` = [33mcol_logical()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
7 parsing failures.
row col expected actual file
146592 taxonomy-9 1/0/T/F/TRUE/FALSE Kofleria_flava 'Suter_2018_count_tables/Cariaco_AB_updated_raw.csv'
146593 taxonomy-9 1/0/T/F/TRUE/FALSE Kofleria_flava 'Suter_2018_count_tables/Cariaco_AB_updated_raw.csv'
146594 taxonomy-9 1/0/T/F/TRUE/FALSE Kofleria_flava 'Suter_2018_count_tables/Cariaco_AB_updated_raw.csv'
146595 taxonomy-9 1/0/T/F/TRUE/FALSE Kofleria_flava 'Suter_2018_count_tables/Cariaco_AB_updated_raw.csv'
146596 taxonomy-9 1/0/T/F/TRUE/FALSE Kofleria_flava 'Suter_2018_count_tables/Cariaco_AB_updated_raw.csv'
...... .......... .................. .............. ....................................................
See problems(...) for more details.
Get sample names
bac_samples <- colnames(bac_counts)[2:49]
arch_samples <- colnames(arch_counts)[2:47]
bac_samples
[1] "AB2a237B" "AB2b267B" "AB2a267B" "AB2b237B" "AB3b103B" "AB2b200A" "AB2b247B"
[8] "AB3a103B" "AB3b314B" "AB2a247B" "AB3b295B" "AB2a200B" "AB3a295B" "AB3a314B"
[15] "AB2a237A" "AB3b198B" "AB2b143B" "AB2a143B" "AB3a198A" "AB3b234B" "AB3a198B"
[22] "AB2a247A" "AB2b200B" "AB3b198A" "AB3b234A" "AB3a234B" "AB3a295A" "AB3a314A"
[29] "AB2a900BN" "AB3a900B" "AB2b900BN" "AB2b267A" "AB3b295A" "AB2a900AN" "AB2b143A"
[36] "AB3b103A" "AB3a103A" "AB3a234A" "AB2b247A" "AB3b314A" "AB2b900AN" "AB2a267A"
[43] "AB1b900A" "AB2b237A" "AB2a143A" "AB1b900B" "AB2a200A" "AB3a900A"
arch_samples
[1] "AA3a314A" "AA2a237B" "AA2b237A" "AA3a900B" "AA1b900A" "AA3a103B" "AA2a247A"
[8] "AA1b900B" "AA3b198A" "AA2b237B" "AA2b143A" "AA3b103A" "AA2a143A" "AA3a103A"
[15] "AA3a198A" "AA2a200A" "AA3b198B" "AA3b234B" "AA2a200B" "AA2a143B" "AA3a234A"
[22] "AA3a295B" "AA2b247B" "AA3a234B" "AA2b247A" "AA3b234A" "AA3b314A" "AA3a314B"
[29] "AA3a198B" "AA3b295A" "AA3a295A" "AA2a237A" "AA3a900A" "AA2b200A" "AA2b267B"
[36] "AA3b314B" "AA2b143B" "AA2b200B" "AA3b103B" "AA3b295B" "AA2a267B" "AA2a247B"
[43] "AA2a900AN" "AA2b900BN" "AA2a900BN" "AA2b900AN"
Make separate taxonomy and count variables
arch_OTU <- arch_counts[,c("#OTU ID",arch_samples)]
arch_taxonomy <- arch_counts %>%
select(-arch_samples) %>%
select(-Sum)
arch_OTU
arch_taxonomy
bac_OTU <- bac_counts[,c("#OTU ID",bac_samples)]
bac_taxonomy <- bac_counts %>%
select(-bac_samples) %>%
select(-Sum) %>%
select(-"Interesting close relatives")
bac_OTU
bac_taxonomy
bac_OTU <- type_convert(as.data.frame(bac_OTU))
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
`#OTU ID` = [31mcol_character()[39m
)
rownames(bac_OTU) <- bac_OTU$`#OTU ID`
bac_OTU <- bac_OTU[,!names(bac_OTU) %in% (c("#OTU ID"))]
bac_OTU = otu_table(bac_OTU, taxa_are_rows = TRUE)
#
arch_OTU <- type_convert(as.data.frame(arch_OTU))
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
`#OTU ID` = [31mcol_character()[39m
)
rownames(arch_OTU) <- arch_OTU$`#OTU ID`
arch_OTU <- arch_OTU[,!names(arch_OTU) %in% (c("#OTU ID"))]
arch_OTU = otu_table(arch_OTU, taxa_are_rows = TRUE)
#
bac_TAX <- type_convert(as.data.frame(bac_taxonomy))
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
`#OTU ID` = [31mcol_character()[39m,
`Refined taxonomy` = [31mcol_character()[39m,
`taxonomy-1` = [31mcol_character()[39m,
`taxonomy-2` = [31mcol_character()[39m,
`taxonomy-3` = [31mcol_character()[39m,
`taxonomy-4` = [31mcol_character()[39m,
`taxonomy-5` = [31mcol_character()[39m,
`taxonomy-6` = [31mcol_character()[39m,
`taxonomy-7` = [31mcol_character()[39m,
`taxonomy-8` = [31mcol_character()[39m
)
rownames(bac_TAX) <- bac_TAX$`#OTU ID`
bac_TAX <- bac_TAX[,!names(bac_TAX) %in% (c("#OTU ID"))]
bac_TAX = tax_table(as.matrix(bac_TAX))
#
arch_TAX <- type_convert(as.data.frame(arch_taxonomy))
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
`#OTU ID` = [31mcol_character()[39m,
`taxonomy-1` = [31mcol_character()[39m,
`taxonomy-2` = [31mcol_character()[39m,
`taxonomy-3` = [31mcol_character()[39m,
`taxonomy-4` = [31mcol_character()[39m,
`taxonomy-5` = [31mcol_character()[39m,
`taxonomy-6` = [31mcol_character()[39m,
`taxonomy-7` = [31mcol_character()[39m
)
rownames(arch_TAX) <- arch_TAX$`#OTU ID`
arch_TAX <- arch_TAX[,!names(arch_TAX) %in% (c("#OTU ID"))]
arch_TAX = tax_table(as.matrix(arch_TAX))
#
META = sample_data(data.frame(metadata, row.names = metadata$`Sample Name`))
#
ps_bac <- phyloseq(bac_OTU, bac_TAX, META)
ps_arch <- phyloseq(arch_OTU, arch_TAX, META)
Filter out the samples with low sequencing effort. These were previously identified for itags paper
taxa_to_keep_b <- !sample_names(ps_bac) %in% c("AB3a900A","AB2a200A","AB2b267A")
ps_bac <- prune_samples(taxa_to_keep_b, ps_bac)
taxa_to_keep_a <- !sample_names(ps_arch) %in% c("AA2b900AN","AA2a247B","AA2a900BN","AA2b900BN")
ps_arch <- prune_samples(taxa_to_keep_a, ps_arch)
First calculate relative abdunance of bac and arch OTU tables
ps_bac_ra <- microbiome::transform(ps_bac, transform = "compositional")
(otu_table(ps_bac_ra))[1:5,1:5]
OTU Table: [5 taxa and 5 samples]
taxa are rows
AB2a237B AB2b267B AB2a267B AB2b237B AB3b103B
denovo231149 0.2125960670 0.5369359318 0.3471806593 0.2100273349 7.856409e-05
denovo348086 0.0014568676 0.0138326887 0.0023327600 0.0012899324 6.043392e-06
denovo302903 0.0001624382 0.0034240801 0.0009700197 0.0001292371 3.021696e-06
denovo104772 0.0310612290 0.0005181989 0.0143657171 0.0281322315 2.243911e-02
denovo309274 0.0620260104 0.0440345135 0.0583425623 0.0681006294 1.869221e-02
ps_arch_ra <- microbiome::transform(ps_arch, transform = "compositional")
(otu_table(ps_arch_ra))[1:5,1:5]
OTU Table: [5 taxa and 5 samples]
taxa are rows
AA3a314A AA2a237B AA2b237A AA3a900B AA1b900A
denovo180502 0.181534684 0.08790871 0.21422859 0.0028892496 0.032998864
denovo80843 0.129102961 0.07914636 0.05450465 0.0043043922 0.029778944
denovo217943 0.025004870 0.21867787 0.16709842 0.0009620487 0.021373974
denovo94410 0.009662359 0.13278112 0.10651172 0.0007448119 0.008834576
denovo199225 0.019405384 0.02748757 0.10588944 0.0008813608 0.009570436
Remove rows of glommed taxa from the full dataframe if their sum across all samples doesn’t exceed 5% (RA > 0.05)
# Bacteria
x <- taxa_sums(ps_bac_ra)
# keepTaxa <- base::which(x > .05)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_pruned <- prune_taxa(keepTaxa, ps_bac_ra)
ps_bac_pruned <- prune_taxa(keepTaxa, ps_bac)
ps_bac_ra_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 124 taxa and 45 samples ]
sample_data() Sample Data: [ 45 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 124 taxa by 10 taxonomic ranks ]
ps_bac_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 124 taxa and 45 samples ]
sample_data() Sample Data: [ 45 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 124 taxa by 10 taxonomic ranks ]
# Archaea
x <- taxa_sums(ps_arch_ra)
# keepTaxa <- base::which(x > .05)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_pruned <- prune_taxa(keepTaxa, ps_arch_ra)
ps_arch_pruned <- prune_taxa(keepTaxa, ps_arch)
ps_arch_ra_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 52 taxa and 42 samples ]
sample_data() Sample Data: [ 42 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 52 taxa by 8 taxonomic ranks ]
ps_arch_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 52 taxa and 42 samples ]
sample_data() Sample Data: [ 42 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 52 taxa by 8 taxonomic ranks ]
# Eukaryotes
x <- taxa_sums(ps_ra)
# keepTaxa <- base::which(x > .05)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_pruned <- prune_taxa(keepTaxa, ps_ra)
ps_euk_pruned <- prune_taxa(keepTaxa, ps)
ps_euk_ra_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 123 taxa and 41 samples ]
sample_data() Sample Data: [ 41 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 123 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 123 tips and 122 internal nodes ]
ps_euk_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 123 taxa and 41 samples ]
sample_data() Sample Data: [ 41 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 123 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 123 tips and 122 internal nodes ]
Trimmed to 124 bacteria OTUs, 52 archaea OTUs, and 123 eukaryotic ASVs (299 total). Proceed with this dataset of the most abundant OTUs for correlations and network analyses…
To do the multi-domain analysis, the sample names from each phyloseq object must match. These currently have “B” for bacteria, A, E etc. Remove this letter from sample names so that “AE2a247B”, “AA2a247B”, “AB2a247B” all become just “Type” from the metadata sheet [IntNov1FL in this case- for Interface, November, rep 1, free-living].
Import my SampleKey
samplekey <- read_csv("SampleKey.csv")
[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────[39m
cols(
Type = [31mcol_character()[39m,
SampleID_bac = [31mcol_character()[39m,
SampleID_arch = [31mcol_character()[39m,
SampleID_euk = [31mcol_character()[39m
)
Change the sample names in the otu tables to sample “Type”
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_pruned))))
# replace col names of otu table from ps_arch_ra_pruned
sample_names(ps_arch_ra_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_pruned))))
sample_names(ps_bac_ra_pruned) <- samplekey_B$Type
sample_names(ps_bac_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_pruned))))
sample_names(ps_euk_ra_pruned) <- samplekey_E$Type
sample_names(ps_euk_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the sample Type- will use this for SparCC
alldomains_df <- bind_rows(data.frame(otu_table(ps_bac_pruned)), data.frame(otu_table(ps_arch_pruned)), data.frame(otu_table(ps_euk_pruned)))
alldomains_df
Change row names from “denovoXXX” to meaningful names
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing values in columns)
alldomains_df_full <- alldomains_df_full %>%
select_if(~ !any(is.na(.)))
t(alldomains_df_full)[1:5,1:5]
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
SubOxNov1FL 41881
AnoxNov2FL 216557
SubOxNov2FL 86132
OxicMay2FL 26
MicroOxNov2PA 7356
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
SubOxNov1FL 287
AnoxNov2FL 5579
SubOxNov2FL 529
OxicMay2FL 2
MicroOxNov2PA 123
denovo302903 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
SubOxNov1FL 32
AnoxNov2FL 1381
SubOxNov2FL 53
OxicMay2FL 1
MicroOxNov2PA 37
denovo104772 Proteobacteria Alphaproteobacteria SAR11_clade
SubOxNov1FL 6119
AnoxNov2FL 209
SubOxNov2FL 11537
OxicMay2FL 7426
MicroOxNov2PA 20010
denovo309274 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
SubOxNov1FL 12219
AnoxNov2FL 17760
SubOxNov2FL 27928
OxicMay2FL 6186
MicroOxNov2PA 1362
alldomains_df <- alldomains_df %>%
select_if(~ !any(is.na(.)))
t(alldomains_df)[1:5,1:5]
denovo231149 denovo348086 denovo302903 denovo104772 denovo309274
SubOxNov1FL 41881 287 32 6119 12219
AnoxNov2FL 216557 5579 1381 209 17760
SubOxNov2FL 86132 529 53 11537 27928
OxicMay2FL 26 2 1 7426 6186
MicroOxNov2PA 7356 123 37 20010 1362
36 samples remain for correlation
Simlarly, make pruned datasets of the most abundant OTUs/ASVs in the oxycline, anoxic, and euxinic samples as separate datasets
Pull out samples and taxa from each redox regime
# Pull out oxycline bacteria sample IDs
oxyclinetypes_bac <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_bac)) %>%
filter(OxCond == "Oxycline") %>%
select("Sample Name")
oxyclinetypes_bac <- unlist(c(unique(oxyclinetypes_bac)), use.names = FALSE)
# Pull out all bacteria from oxycline
ps_bac_oxycline <- prune_samples(oxyclinetypes_bac, ps_bac)
ps_bac_ra_oxycline <- prune_samples(oxyclinetypes_bac, ps_bac_ra)
# Pull out oxycline archaea sample IDs
oxyclinetypes_arch <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_arch)) %>%
filter(OxCond == "Oxycline") %>%
select("Sample Name")
oxyclinetypes_arch <- unlist(c(unique(oxyclinetypes_arch)), use.names = FALSE)
# Pull out all archaea from oxycline
ps_arch_oxycline <- prune_samples(oxyclinetypes_arch, ps_arch)
ps_arch_ra_oxycline <- prune_samples(oxyclinetypes_arch, ps_arch_ra)
# Pull out oxycline eukaryotic sample IDs
oxyclinetypes_euk <- metadata %>%
filter(`Sample Name` %in% sample_names(ps)) %>%
filter(OxCond == "Oxycline") %>%
select("Sample Name")
oxyclinetypes_euk <- unlist(c(unique(oxyclinetypes_euk)), use.names = FALSE)
# Pull out all eukaryotes from oxycline
ps_euk_oxycline <- prune_samples(oxyclinetypes_euk, ps)
ps_euk_ra_oxycline <- prune_samples(oxyclinetypes_euk, ps_ra)
Filter out low abundance taxa from the oxycline samples. Use 5% as cutoff
# Bacteria
x <- taxa_sums(ps_bac_ra_oxycline)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_oxycline_pruned <- prune_taxa(keepTaxa, ps_bac_ra_oxycline)
ps_bac_oxycline_pruned <- prune_taxa(keepTaxa, ps_bac_oxycline)
ps_bac_ra_oxycline_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 79 taxa and 23 samples ]
sample_data() Sample Data: [ 23 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 79 taxa by 10 taxonomic ranks ]
ps_bac_oxycline_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 79 taxa and 23 samples ]
sample_data() Sample Data: [ 23 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 79 taxa by 10 taxonomic ranks ]
# Archaea
x <- taxa_sums(ps_arch_ra_oxycline)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_oxycline_pruned <- prune_taxa(keepTaxa, ps_arch_ra_oxycline)
ps_arch_oxycline_pruned <- prune_taxa(keepTaxa, ps_arch_oxycline)
ps_arch_ra_oxycline_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 36 taxa and 24 samples ]
sample_data() Sample Data: [ 24 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 36 taxa by 8 taxonomic ranks ]
ps_arch_oxycline_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 36 taxa and 24 samples ]
sample_data() Sample Data: [ 24 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 36 taxa by 8 taxonomic ranks ]
# Eukaryotes
x <- taxa_sums(ps_euk_ra_oxycline)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_oxycline_pruned <- prune_taxa(keepTaxa, ps_euk_ra_oxycline)
ps_euk_oxycline_pruned <- prune_taxa(keepTaxa, ps_euk_oxycline)
ps_euk_ra_oxycline_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 76 taxa and 21 samples ]
sample_data() Sample Data: [ 21 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 76 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 76 tips and 75 internal nodes ]
ps_euk_oxycline_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 76 taxa and 21 samples ]
sample_data() Sample Data: [ 21 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 76 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 76 tips and 75 internal nodes ]
79 bacteria, 36 archaea, 76 eukaryota remain
Change the sample names in the otu tables to “Type”
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_oxycline_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_oxycline_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_oxycline_pruned))))
# replace col names of otu table from ps_arch_ra_oxycline_pruned
sample_names(ps_arch_ra_oxycline_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_oxycline_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_oxycline_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_oxycline_pruned))))
sample_names(ps_bac_ra_oxycline_pruned) <- samplekey_B$Type
sample_names(ps_bac_oxycline_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_oxycline_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_oxycline_pruned))))
sample_names(ps_euk_ra_oxycline_pruned) <- samplekey_E$Type
sample_names(ps_euk_oxycline_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the samples names (“Type”)
alldomains_df_oxycline <- bind_rows(data.frame(otu_table(ps_bac_oxycline_pruned)), data.frame(otu_table(ps_arch_oxycline_pruned)), data.frame(otu_table(ps_euk_oxycline_pruned)))
alldomains_df_oxycline
Change row names from “denovoXXX” to meaningful names
alldomains_df_full_oxycline <- cbind(ID = rownames(alldomains_df_oxycline), alldomains_df_oxycline)
# start with only first 76 rows, which are bacteria. make one column of meaningful labels
temp1 <- left_join(alldomains_df_full_oxycline[1:76,], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next 7 rows are the archaea
temp2 <- left_join(alldomains_df_full_oxycline[77:80,], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last 28 rows are eukarya
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full_oxycline[81:106,], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full_oxycline <- rbind(temp1, temp2, temp3)
alldomains_df_full_oxycline <- data.frame(alldomains_df_full_oxycline)
rownames(alldomains_df_full_oxycline) <- alldomains_df_full_oxycline$New_ID
alldomains_df_full_oxycline <- select(alldomains_df_full_oxycline, -c("ID","New_ID"))
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing domains)
alldomains_df_full_oxycline <- alldomains_df_full_oxycline %>%
select_if(~ !any(is.na(.)))
t(alldomains_df_full_oxycline)[1:5,1:5]
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
SubOxNov1FL 41881
SubOxNov2FL 86132
OxicMay2FL 26
MicroOxNov2PA 7356
OxicMay1FL 9
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
SubOxNov1FL 21392
SubOxNov2FL 45275
OxicMay2FL 6439
MicroOxNov2PA 2246
OxicMay1FL 9007
denovo104772 Proteobacteria Alphaproteobacteria SAR11_clade
SubOxNov1FL 6119
SubOxNov2FL 11537
OxicMay2FL 7426
MicroOxNov2PA 20010
OxicMay1FL 7183
denovo26278 Proteobacteria Gammaproteobacteria E01-9C-26_marine_group
SubOxNov1FL 4072
SubOxNov2FL 10114
OxicMay2FL 1631
MicroOxNov2PA 21544
OxicMay1FL 1410
denovo290780 Proteobacteria Gammaproteobacteria Oceanospirillales
SubOxNov1FL 3409
SubOxNov2FL 9018
OxicMay2FL 22751
MicroOxNov2PA 8493
OxicMay1FL 23923
21 samples remain for correlation
Pull out samples from shallow anoxic regime
# Pull out anoxic layer bacteria sample IDs
anoxictypes_bac <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_bac_orders)) %>%
filter(OxCond == "ShallowAnoxic") %>%
select("Sample Name")
anoxictypes_bac <- unlist(c(unique(anoxictypes_bac)), use.names = FALSE)
# Pull out all bacteria from anoxic layer
ps_bac_anoxic <- prune_samples(anoxictypes_bac, ps_bac_orders)
ps_bac_ra_anoxic <- prune_samples(anoxictypes_bac, ps_bac_ra_orders)
# Pull out anoxic layer archaea sample IDs
anoxictypes_arch <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_arch_classes)) %>%
filter(OxCond == "ShallowAnoxic") %>%
select("Sample Name")
anoxictypes_arch <- unlist(c(unique(anoxictypes_arch)), use.names = FALSE)
# Pull out all archaea from anoxic layer
ps_arch_anoxic<- prune_samples(anoxictypes_arch, ps_arch_classes)
ps_arch_ra_anoxic <- prune_samples(anoxictypes_arch, ps_arch_ra_classes)
# Pull out anoxic layer eukaryotic sample IDs
anoxictypes_euk <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_euk_orders)) %>%
filter(OxCond == "ShallowAnoxic") %>%
select("Sample Name")
anoxictypes_euk <- unlist(c(unique(anoxictypes_euk)), use.names = FALSE)
# Pull out all eukaryotes from anoxic layer
ps_euk_anoxic <- prune_samples(anoxictypes_euk, ps_euk_orders)
ps_euk_ra_anoxic <- prune_samples(anoxictypes_euk, ps_euk_ra_orders)
Filter out low abundance taxa from the oxycline samples. Use 1% as cutoff since this is a smaller dataset
# Bacteria
x <- taxa_sums(ps_bac_ra_anoxic)
keepTaxa <- x>.01 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_anoxic_pruned <- prune_taxa(keepTaxa, ps_bac_ra_anoxic)
ps_bac_anoxic_pruned <- prune_taxa(keepTaxa, ps_bac_anoxic)
ps_bac_ra_anoxic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 59 taxa and 15 samples ]
sample_data() Sample Data: [ 15 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 59 taxa by 10 taxonomic ranks ]
ps_bac_anoxic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 59 taxa and 15 samples ]
sample_data() Sample Data: [ 15 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 59 taxa by 10 taxonomic ranks ]
# Archaea
x <- taxa_sums(ps_arch_ra_anoxic)
keepTaxa <- x>.01 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_anoxic_pruned <- prune_taxa(keepTaxa, ps_arch_ra_anoxic)
ps_arch_anoxic_pruned <- prune_taxa(keepTaxa, ps_arch_anoxic)
ps_arch_ra_anoxic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 5 taxa and 13 samples ]
sample_data() Sample Data: [ 13 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 5 taxa by 8 taxonomic ranks ]
ps_arch_anoxic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 5 taxa and 13 samples ]
sample_data() Sample Data: [ 13 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 5 taxa by 8 taxonomic ranks ]
# Eukaryotes
x <- taxa_sums(ps_euk_ra_anoxic)
keepTaxa <- x>.01 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_anoxic_pruned <- prune_taxa(keepTaxa, ps_euk_ra_anoxic)
ps_euk_anoxic_pruned <- prune_taxa(keepTaxa, ps_euk_anoxic)
ps_euk_ra_anoxic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 31 taxa and 14 samples ]
sample_data() Sample Data: [ 14 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 31 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 31 tips and 30 internal nodes ]
ps_euk_anoxic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 31 taxa and 14 samples ]
sample_data() Sample Data: [ 14 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 31 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 31 tips and 30 internal nodes ]
59 bacteria, 5 archaea, 31 eukaryota remain
Change the sample names in the otu tables to “Type”
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_anoxic_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_anoxic_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_anoxic_pruned))))
# replace col names of otu table from ps_arch_ra_anoxic_pruned
sample_names(ps_arch_ra_anoxic_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_anoxic_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_anoxic_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_anoxic_pruned))))
sample_names(ps_bac_ra_anoxic_pruned) <- samplekey_B$Type
sample_names(ps_bac_anoxic_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_anoxic_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_anoxic_pruned))))
sample_names(ps_euk_ra_anoxic_pruned) <- samplekey_E$Type
sample_names(ps_euk_anoxic_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the samples names (“Type”)
alldomains_df_anoxic <- bind_rows(data.frame(otu_table(ps_bac_anoxic_pruned)), data.frame(otu_table(ps_arch_anoxic_pruned)), data.frame(otu_table(ps_euk_anoxic_pruned)))
alldomains_df_anoxic
Change row names from “denovoXXX” to meaningful names
alldomains_df_full_anoxic <- cbind(ID = rownames(alldomains_df_anoxic), alldomains_df_anoxic)
# start with only first 76 rows, which are bacteria. make one column of meaningful labels
temp1 <- left_join(alldomains_df_full_anoxic[1:59,], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next 7 rows are the archaea
temp2 <- left_join(alldomains_df_full_anoxic[60:64,], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last 28 rows are eukarya
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full_anoxic[65:95,], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full_anoxic <- rbind(temp1, temp2, temp3)
alldomains_df_full_anoxic <- data.frame(alldomains_df_full_anoxic)
rownames(alldomains_df_full_anoxic) <- alldomains_df_full_anoxic$New_ID
alldomains_df_full_anoxic <- select(alldomains_df_full_anoxic, -c("ID","New_ID"))
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing domains)
alldomains_df_full_anoxic <- alldomains_df_full_anoxic %>%
select_if(~ !any(is.na(.)))
t(alldomains_df_full_anoxic)[1:5,1:5]
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
AnoxNov2FL 216557
IntNov2FL 197506
AnoxMay2FL 170582
IntMay2FL 252604
IntMay1FL 322116
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
AnoxNov2FL 49317
IntNov2FL 74214
AnoxMay2FL 71664
IntMay2FL 42663
IntMay1FL 24278
denovo104772 Proteobacteria Alphaproteobacteria SAR11_clade
AnoxNov2FL 209
IntNov2FL 5059
AnoxMay2FL 77
IntMay2FL 461
IntMay1FL 805
denovo26278 Proteobacteria Gammaproteobacteria E01-9C-26_marine_group
AnoxNov2FL 5459
IntNov2FL 13419
AnoxMay2FL 2348
IntMay2FL 5737
IntMay1FL 3694
denovo290780 Proteobacteria Gammaproteobacteria Oceanospirillales
AnoxNov2FL 1122
IntNov2FL 3734
AnoxMay2FL 600
IntMay2FL 1304
IntMay1FL 1816
11 samples remain for correlation
Pull out samples from shallow anoxic regime
# Pull out anoxic layer bacteria sample IDs
euxinictypes_bac <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_bac_orders)) %>%
filter(OxCond == "Euxinic") %>%
select("Sample Name")
euxinictypes_bac <- unlist(c(unique(euxinictypes_bac)), use.names = FALSE)
# Pull out all bacteria from euxinic layer
ps_bac_euxinic <- prune_samples(euxinictypes_bac, ps_bac_orders)
ps_bac_ra_euxinic <- prune_samples(euxinictypes_bac, ps_bac_ra_orders)
# Pull out euxinic layer archaea sample IDs
euxinictypes_arch <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_arch_classes)) %>%
filter(OxCond == "Euxinic") %>%
select("Sample Name")
euxinictypes_arch <- unlist(c(unique(euxinictypes_arch)), use.names = FALSE)
# Pull out all archaea from euxinic layer
ps_arch_euxinic<- prune_samples(euxinictypes_arch, ps_arch_classes)
ps_arch_ra_euxinic <- prune_samples(euxinictypes_arch, ps_arch_ra_classes)
# Pull out euxinic layer eukaryotic sample IDs
euxinictypes_euk <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_euk_orders)) %>%
filter(OxCond == "Euxinic") %>%
select("Sample Name")
euxinictypes_euk <- unlist(c(unique(euxinictypes_euk)), use.names = FALSE)
# Pull out all eukaryotes from euxinic layer
ps_euk_euxinic <- prune_samples(euxinictypes_euk, ps_euk_orders)
ps_euk_ra_euxinic <- prune_samples(euxinictypes_euk, ps_euk_ra_orders)
Filter out low abundance taxa from the oxycline samples. Use 1% as cutoff since this is a smaller dataset
# Bacteria
x <- taxa_sums(ps_bac_ra_euxinic)
keepTaxa <- x>.01 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_euxinic_pruned <- prune_taxa(keepTaxa, ps_bac_ra_euxinic)
ps_bac_euxinic_pruned <- prune_taxa(keepTaxa, ps_bac_euxinic)
ps_bac_ra_euxinic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 55 taxa and 7 samples ]
sample_data() Sample Data: [ 7 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 55 taxa by 10 taxonomic ranks ]
ps_bac_euxinic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 55 taxa and 7 samples ]
sample_data() Sample Data: [ 7 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 55 taxa by 10 taxonomic ranks ]
# Archaea
x <- taxa_sums(ps_arch_ra_euxinic)
keepTaxa <- x>.01 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_euxinic_pruned <- prune_taxa(keepTaxa, ps_arch_ra_euxinic)
ps_arch_euxinic_pruned <- prune_taxa(keepTaxa, ps_arch_euxinic)
ps_arch_ra_euxinic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 11 taxa and 5 samples ]
sample_data() Sample Data: [ 5 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 11 taxa by 8 taxonomic ranks ]
ps_arch_euxinic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 11 taxa and 5 samples ]
sample_data() Sample Data: [ 5 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 11 taxa by 8 taxonomic ranks ]
# Eukaryotes
x <- taxa_sums(ps_euk_ra_euxinic)
keepTaxa <- x>.01 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_euxinic_pruned <- prune_taxa(keepTaxa, ps_euk_ra_euxinic)
ps_euk_euxinic_pruned <- prune_taxa(keepTaxa, ps_euk_euxinic)
ps_euk_ra_euxinic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 18 taxa and 6 samples ]
sample_data() Sample Data: [ 6 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 18 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 18 tips and 17 internal nodes ]
ps_euk_euxinic_pruned
phyloseq-class experiment-level object
otu_table() OTU Table: [ 18 taxa and 6 samples ]
sample_data() Sample Data: [ 6 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 18 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 18 tips and 17 internal nodes ]
55 bacteria, 11 archaea, 18 eukaryota remain
Change the sample names in the otu tables to “Type”
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_euxinic_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_euxinic_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_euxinic_pruned))))
# replace col names of otu table from ps_arch_ra_euxinic_pruned
sample_names(ps_arch_ra_euxinic_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_euxinic_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_euxinic_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_euxinic_pruned))))
sample_names(ps_bac_ra_euxinic_pruned) <- samplekey_B$Type
sample_names(ps_bac_euxinic_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_euxinic_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_euxinic_pruned))))
sample_names(ps_euk_ra_euxinic_pruned) <- samplekey_E$Type
sample_names(ps_euk_euxinic_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the samples names (“Type”)
alldomains_df_euxinic <- bind_rows(data.frame(otu_table(ps_bac_euxinic_pruned)), data.frame(otu_table(ps_arch_euxinic_pruned)), data.frame(otu_table(ps_euk_euxinic_pruned)))
alldomains_df_euxinic
Change row names from “denovoXXX” to meaningful names
alldomains_df_full_euxinic <- cbind(ID = rownames(alldomains_df_euxinic), alldomains_df_euxinic)
# start with only first 76 rows, which are bacteria. make one column of meaningful labels
temp1 <- left_join(alldomains_df_full_euxinic[1:55,], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next 7 rows are the archaea
temp2 <- left_join(alldomains_df_full_euxinic[56:66,], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last 28 rows are eukarya
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full_euxinic[67:84,], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full_euxinic <- rbind(temp1, temp2, temp3)
alldomains_df_full_euxinic <- data.frame(alldomains_df_full_euxinic)
rownames(alldomains_df_full_euxinic) <- alldomains_df_full_euxinic$New_ID
alldomains_df_full_euxinic <- select(alldomains_df_full_euxinic, -c("ID","New_ID"))
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing domains)
alldomains_df_full_euxinic <- alldomains_df_full_euxinic %>%
select_if(~ !any(is.na(.)))
t(alldomains_df_full_euxinic)[1:4,1:4]
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
DeepMay1FL 121125
DeepNov1PA 16299
DeepMay2PA 9668
DeepMay2FL 137256
denovo260142 Proteobacteria Betaproteobacteria Burkholderiales
DeepMay1FL 384
DeepNov1PA 17537
DeepMay2PA 3092
DeepMay2FL 100
denovo340 Proteobacteria Deltaproteobacteria SAR324_clade(Marine_group_B)
DeepMay1FL 540
DeepNov1PA 1270
DeepMay2PA 1801
DeepMay2FL 730
denovo463083 Proteobacteria Gammaproteobacteria Oceanospirillales
DeepMay1FL 1043
DeepNov1PA 1248
DeepMay2PA 148
DeepMay2FL 160
4 samples remain for correlation
This is largely based on BVCN tutorials NOTE- input for SparCC should be raw count data (after filtering out low-abundance ASVs). The function does a log-ratio transformation to account for compositionality
sparcctable_alldomains <- sparcc(t(alldomains_df))
Put sample names back into result tables
rownames(sparcctable_alldomains$Cor) <- rownames(alldomains_df_full)
colnames(sparcctable_alldomains$Cor) <- rownames(alldomains_df_full)
rownames(sparcctable_alldomains$Cov) <- rownames(alldomains_df_full)
colnames(sparcctable_alldomains$Cov) <- rownames(alldomains_df_full)
sparcctable_alldomains$Cor[1:2,1:2]
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales 1.000000
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A) 0.505872
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales 0.505872
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A) 1.000000
Plot correlation
plotableSparcc <- sparcctable_alldomains$Cor %>% reorder_cormat %>% get_upper_tri() %>% reshape2::melt() %>% na.omit()
Sparcc_plot <- plotableSparcc %>% ggplot(aes(x = Var2, y = Var1, fill = value)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Sparcc_plot
ggsave("figures/sparcc_corr_alldomains.eps",Sparcc_plot, width = 20, height = 20, units = c("in"))
Calculate Sparcc p-values by bootstrapping- TAKES A LONG TIME
tp0 <- proc.time()
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
out2 <- sparccboot(t(alldomains_df), R = 1000, ncpus = 2)
tp1 <- proc.time()
tp1 - tp0
user system elapsed
4523.981 515.089 2565.597
The above took <2 hours to run 1000 iterations
Extract p-values
outP <- pval.sparccboot(out2)
data.frame(outP$cors, outP$pvals) %>% head
cors <- outP$cors
pvals <- outP$pvals
sparCCpcors <- diag(0.5, nrow = dim(sparcctable_alldomains$Cor)[1], ncol = dim(sparcctable_alldomains$Cor)[1])
sparCCpcors[upper.tri(sparCCpcors, diag=FALSE)] <- cors
sparCCpcors <- sparCCpcors + t(sparCCpcors)
sparCCpval <- diag(0.5, nrow = dim(sparcctable_alldomains$Cor)[1], ncol = dim(sparcctable_alldomains$Cor)[1])
sparCCpval[upper.tri(sparCCpval, diag=FALSE)] <- pvals
sparCCpval <- sparCCpval + t(sparCCpval)
rownames(sparCCpcors) <- rownames(alldomains_df_full)
colnames(sparCCpcors) <- rownames(alldomains_df_full)
rownames(sparCCpval) <- rownames(alldomains_df_full)
colnames(sparCCpval) <- rownames(alldomains_df_full)
sparCCpcors[1:2, 1:2]
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales 1.000000
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A) 0.505834
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales 0.505834
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A) 1.000000
sparCCpval[1:2, 1:2]
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales 1.000
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A) 0.002
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
denovo231149 Proteobacteria Gammaproteobacteria Chromatiales 0.002
denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A) 1.000
Reorder for plotting
reordered_all_sparcc <- reorder_cor_and_p(sparCCpcors, sparCCpval)
reordered_sparccCor <- reordered_all_sparcc$r
reordered_sparccP<- reordered_all_sparcc$p
sparccCor_processed <- reordered_sparccCor %>% get_upper_tri() %>% reshape2::melt() %>% na.omit() %>% rename(cor = value)
sparccP_processed <- reordered_sparccP %>% get_upper_tri() %>% reshape2::melt() %>% na.omit() %>% rename(p = value)
# join the two data frames
SparccP <- left_join(sparccCor_processed, sparccP_processed, by = c("Var1", "Var2")) %>%
# # remove self correlations
# filter(Var1 != Var2) %>%
# calculate the false discovery rate to adjust for multiple p values
mutate(fdr = p.adjust(p, method = "BH"))
And plot correlation with p-values. Circles mean that the relationship is sig. at p = 0.05 level, based on bootstrapping
fdrThresh <- 0.01 # fdr threshold
sparccOkP <- SparccP%>% filter(fdr < fdrThresh)
SparccP_plot <- SparccP %>% ggplot(aes(x = Var2, y = Var1, fill = cor)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + geom_point(data = sparccOkP, shape = 1)
SparccP_plot
ggsave("figures/sparcc_corr_alldomains_w_pvals.eps",SparccP_plot, width = 20, height = 20, units = c("in"))
Save environment again
# save.image("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_sparcc_bootstrap.RData")
Or load if coming back
load("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_sparcc_bootstrap.RData")
Try the glasso method, as described in the spieceasi github and BVCN lessons 1.2. This reduces the clumps (eg. correlations that are secondary or teriary, not direct relationships)
Make functions from tutorial
convertSEToTable <- function(se_out,sp.names){
There were 18 warnings (use warnings() to see them)
#This is just a fancy helper function to get the data in a comparable format to the output of lesson 1 so we can make a similar plot. We will cover other methods for visualizing this type of output in future lessons.
secor <- cov2cor(as.matrix(getOptCov(se_out))) # See spieceasi documentation for how to pull out weights for comparison
elist <- summary(triu(secor*getRefit(se_out), k=1))
elist[,1] <- sp.names[elist[,1]]
elist[,2] <- sp.names[elist[,2]]
elist[,4] <- paste(elist[,1],elist[,2])
full_e <- expand.grid(sp.names,sp.names)
rownames(full_e) <- paste(full_e[,1],full_e[,2])
full_e[,"Weight"] <- 0
full_e[elist[,4],"Weight"] <- elist[,3]
x <- expand.grid(1:length(sp.names),1:length(sp.names))
full_e[x[,"Var1"]>x[,"Var2"],"Weight"] <- NA
return(as.data.frame(full_e,stringsAsFactors=F))
}
Follow the spieceasi documentation to find optimal parameters
#Run Spieceasi
pargs <- list(seed=10010)
se <- spiec.easi(t(alldomains_df), method='glasso', lambda.min.ratio=5e-1,
nlambda=100, pulsar.params=list(rep.num=50))
Applying data transformations...
Selecting model with pulsar using stars...
Fitting final estimate with glasso...
done
the above takes a couple of minutes to run
#This is just a fancy helper function to get the data in a comparable format to the output of lesson 1 so we can make a similar plot. We will cover other methods for visualizing this type of output in future lessons.
tab.se <- convertSEToTable(se,sp.names=colnames(t(alldomains_df_full)))
#Plot using ggplot as in lesson 1
plot.se <- ggplot(tab.se,aes(x = Var1, y = Var2, fill = Weight)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plot.se)
ggsave("figures/glasso_corr_alldomains.eps",plot.se, width = 20, height = 20, units = c("in"))
Note- only the significant values above show up in the heatmap above (ie. there is no “p-value”)
Build using igraph, following BVCN lesson 2.2
#Extract adjacency matrix from spiecEasi output
adj.mat <- getRefit(se)
table(as.numeric(adj.mat))
0 1
11340 760
# Extract weighted adjacency
se.cor <- cov2cor(as.matrix(getOptCov(se)))
weighted.adj.mat <- se.cor*getRefit(se)
#Let's take a loot at that adjacency matrix
heatmap(as.matrix(weighted.adj.mat))
Convert to graph objects
grph.unweighted <- adj2igraph(adj.mat)
grph <- adj2igraph(weighted.adj.mat)
Plot weighted graph
plot(grph,vertex.size=1,
vertex.label=NA,
edge.width=1,
layout=layout.circle(grph))
Put back in species names
V(grph)$name <- rownames(alldomains_df_full)
V(grph)
+ 110/110 vertices, named, from d9969e0:
[1] denovo231149 Proteobacteria Gammaproteobacteria Chromatiales
[2] denovo348086 Deferribacteres Deferribacterales SAR406_clade(Marine_group_A)
[3] denovo104772 Proteobacteria Alphaproteobacteria SAR11_clade
[4] denovo26278 Proteobacteria Gammaproteobacteria E01-9C-26_marine_group
[5] denovo290780 Proteobacteria Gammaproteobacteria Oceanospirillales
[6] denovo258351 Proteobacteria Gammaproteobacteria Oceanospirillales
[7] denovo408458 Proteobacteria Deltaproteobacteria Desulfobacterales
[8] denovo260142 Proteobacteria Betaproteobacteria Burkholderiales
[9] denovo139931 Proteobacteria Alphaproteobacteria SAR11_clade
[10] denovo340 Proteobacteria Deltaproteobacteria SAR324_clade(Marine_group_B)
+ ... omitted several vertices
Make size of nodes proportional to degree (number of connections)
V(grph)$size <- (degree(grph) + 1) # the +1 is to avoid size zero vertices
V(grph)$color <- "black"
plot(grph,
vertex.label=NA,
layout=layout.circle(grph))
Color by connection (positive or negative) and changes width of edges to be proportional to their weights, Scale node sizes to be smaller
E(grph)$color <- custombluegreen
E(grph)$color[E(grph)$weight<0] <- customreddishpurple
E(grph)$width <- abs(E(grph)$weight)*10
V(grph)$size <- V(grph)$size/2
plot(grph,
vertex.label=NA,
layout=layout.circle(grph))
Remove low-weight edges (you decide what threshold is right for your network):
# weight_threshold <- 0.05
# grph <- delete.edges(grph,which(abs(E(grph)$weight)<weight_threshold))
# plot(grph,
# vertex.label=NA,
# layout=layout.circle(grph))
Plot only pos interactions, delete nodes that are not connected to anything
#Remove negative edges
grph.pos <- delete.edges(grph,which(E(grph)$weight<0))
grph.pos <- delete.vertices(grph.pos,which(degree(grph.pos)<1))
plot(grph.pos,
vertex.label=NA)
# Layout with fruchterman-reingold algorithm
plot(grph.pos,
vertex.label=NA,
layout=layout_with_fr(grph.pos))
Check out remaining nodes and highlight interesting ones
# Color the syndiniales (only group V remaining)
V(grph.pos)$color[V(grph.pos)$name=="ASV_232 Alveolata Dinoflagellata Syndiniales Dino-Group-V"] <- custombluegreen
e.Synd <- E(grph.pos)[from("ASV_232 Alveolata Dinoflagellata Syndiniales Dino-Group-V")]
Error in as.igraph.vs(graph, v) : Invalid vertex names
Plot only neg interactions
#Remove negative edges
grph.neg <- delete.edges(grph,which(E(grph)$weight>0))
grph.neg <- delete.vertices(grph.neg,which(degree(grph.neg)<1))
plot(grph.neg,
vertex.label=NA)
# Layout with fruchterman-reingold algorithm
plot(grph.neg,
vertex.label=NA,
layout=layout_with_fr(grph.neg))
Need to constrain the size of the count tables more, otherwise the matrix algebra during ordinations overwhelems R.
McMurdie and Holmes (2013) filter out taxa that were not seen with more than 3 counts in at least 20% of the samples. We also want to add a pseduocount of 1 to all counts. This is so that later when we do different calculations (log, division, etc) we don’t get back errors due to zeroes
ps_filtered = filter_taxa(ps, function(x) sum(x > 3) > (0.2*length(x)), TRUE)
ps_filtered <- transform_sample_counts(ps_filtered, function(x) x+1)
# Also make a filtered version of the relative abundance count table (for plotting purposes)
ps_ra_filtered <- prune_taxa(taxa_names(ps_filtered),ps_ra) # prune from ps_ra object (relative abundances)
# check number of ASVs in each
ps
ps_filtered
ps_ra_filtered
That reduced the otu table from 13,909 ASVs to 1395 ASVs
Check remaining phyla
table(tax_table(ps_filtered)[, "phylum"], exclude = NULL)
Check by plotting phyla again
# There are many phyla here, so have to make a custom color palette by interpolating from an existing one in RColorBrewer
colourCount = length(table(tax_table(ps_ra_filtered)[, "phylum"], exclude = NULL))
getPalette = colorRampPalette(brewer.pal(11, "Spectral"))
PhylaFilteredPalette = getPalette(colourCount)
phylum_filtered_Glommed_RA = tax_glom(ps_ra_filtered, "phylum")
# and plot
phyla_barplot_filtered <- plot_bar(phylum_filtered_Glommed_RA, x = "Sample", fill = "phylum") +
scale_fill_manual(values = PhylaFilteredPalette) +
theme(legend.text = element_text(size = 10))
phyla_barplot_filtered
# export
ggsave("figures/phyla_barplot_filtered.eps",phyla_barplot_filtered, width = 15, height = 5, units = c("in"))
Lastly, reroot-the tree affiliated with the ps_filtered and ps_ra_filtered dataset, since the root may have been removed when filtering
# first define function from link above to find furthest outgroup
pick_new_outgroup <- function(tree.unrooted){
require("magrittr")
require("data.table")
require("ape") # ape::Ntip
# tablify parts of tree that we need.
treeDT <-
cbind(
data.table(tree.unrooted$edge),
data.table(length = tree.unrooted$edge.length)
)[1:Ntip(tree.unrooted)] %>%
cbind(data.table(id = tree.unrooted$tip.label))
# Take the longest terminal branch as outgroup
new.outgroup <- treeDT[which.max(length)]$id
return(new.outgroup) }
# then run on my phyloseq tree
my.tree <- phy_tree(ps_filtered)
out.group <- pick_new_outgroup(my.tree)
out.group
# Then use this outgroup to root the tree
new.tree1 <- ape::root(my.tree, outgroup=out.group, resolve.root=TRUE)
phy_tree(ps_filtered) <- new.tree1
phy_tree(ps_ra_filtered) <- new.tree1
# Check if tree is binary (not multichotomous)
is.binary.tree(phy_tree(ps_filtered))
Now move onto ordinations using this filtered dataset
based on Coenen et al. tutorials for clustering. See repo
# Estimate covariance matrix for OTUs
covariance_matrix <- as.matrix(otu_table(ps_filtered)) %*% t(otu_table(ps_filtered))
# %*% = matrix multiplication sign in R; used here to multiply OTU/ASV data matrix to itself to estimate covariance.
# Evaluate determinant of covariance matrix
cov_determinant <- det(covariance_matrix)
cov_determinant
The determinant of the covariance matrix (what we just calculated) is equivalent to the product of the proportion of variance explained by every PCA axis. If the determinant is 0, that means there is an axis which explains 0 variance that we can’t separate from the other axes. This means the data need to be transformed to be suitable for PCA.
PCA is essentially a type of PCoA using the Euclidean distance matrix as input. When combined with a log-ratio transformation of the count table, this is deemed appropriate for compositional datasets.
First do a CLR, centered log ratio transformation of the absolute abundance data (after filtering), as suggested by Gloor et al. 2017 and check the determinant of this matrix. Compare it to the determinant without any transformation. {Also commented out for now because takes a few minutes}
# Estimate covariance matrix for absolute abundance ASV table
covariance_matrix <- as.matrix(otu_table(ps_filtered)) %*% t(otu_table(ps_filtered))
# %*% = matrix multiplication sign in R; used here to multiply OTU/ASV data matrix to itself to estimate covariance.
# Evaluate determinant of covariance matrix
cov_determinant <- det(covariance_matrix)
# Estimate covariance matrix for CLR-transformed ASV table
clr_asv_table_ps_filtered <- data.frame(compositions::clr(t(otu_table(ps_filtered))))
## Check new determinant of clr transformed table
new_covdet <- det(as.matrix(clr_asv_table_ps_filtered) %*% t(clr_asv_table_ps_filtered))
# Compare
cov_determinant #Original Count Data
new_covdet # New
The determinant of a covariance matrix = the product of the proportion of variance explained by every PCA axis. If one axis is zero, the product (determinant) is 0, and we can’t separate that axis from the other axes. Thus, these data require transformation in order to be suitable for ordinations. The determinant of the CLR-transformed table is not zero, so we can proceed with PCA of the CLR-transformed data.
Generate the PCA and visualize axes
# Generate a Principle Component Analysis (PCA) and evaluated based on the eigen decomposition from sample covariance matrix.
lograt_pca <- prcomp(clr_asv_table_ps_filtered)
# NOTE- this is equivalent to first making a Euclidean distance matrix using the CLR data table and then running a PCoA. A Euclidean distance matrix of a log-transformed data table = an Aitchison distance matrix. So this is equivalent to the compositional methods listed in Gloor et al.
# Visual representation with a screeplot
lograt_variances <- as.data.frame(lograt_pca$sdev^2/sum(lograt_pca$sdev^2)) %>% #Extract axes
# Format to plot
select(PercVar = 'lograt_pca$sdev^2/sum(lograt_pca$sdev^2)') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(lograt_variances)
# Plot screeplot
ggplot(lograt_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "Log-Ratio PCA Screeplot, CLR Tranformation")
A faithful representation is to plot this ordination in 2D because the 3rd and 4th axes appear very similar, while the 1st and 2nd explain a decently large proportion of variance (23.8 + 19.7 = 43.5%)
Visualize the PCA
# lograt_pca$x #View PC values
pca_lograt_frame <- data.frame(lograt_pca$x) %>%
rownames_to_column(var = "#SampleID")
# Merge metadata into the pca data table
pca_lograt_frame <- left_join(pca_lograt_frame, metadata, by = "#SampleID")
head(pca_lograt_frame)
# Plot PCA with Salinity
pca_lograt_salinity <- ggplot(pca_lograt_frame, aes(x = PC1, y = PC2)) +
geom_point(aes(color = Salinity, shape = Size_fraction), size = 4) +
geom_text(aes(label = `#SampleID`), size = 3, hjust = 0, vjust = 0) +
ylab(paste0('PC2 ', round(lograt_variances[2,2]*100,2),'%')) + #Extract y axis value from variance
xlab(paste0('PC1 ', round(lograt_variances[1,2]*100,2),'%')) + #Extract x axis value from variance
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_filtered)$Salinity), max(sample_data(ps_filtered)$Salinity)), name = 'Salinity') +
ggtitle('CLR-Euclidean PCA') +
coord_fixed(ratio = 1) +
theme_bw()
pca_lograt_salinity
ggsave("figures/pca_lograt_salinity.eps",pca_lograt_salinity, width = 7, height = 5, units = c("in"))
# Plot PCA with Temperature
pca_lograt_temp <- ggplot(pca_lograt_frame, aes(x = PC1, y = PC2)) +
geom_point(aes(color = Temperature, shape = Size_fraction), size = 4) +
geom_text(aes(label = `#SampleID`), size = 3, hjust = 0, vjust = 0) +
ylab(paste0('PC2 ', round(lograt_variances[2,2]*100,2),'%')) + #Extract y axis value from variance
xlab(paste0('PC1 ', round(lograt_variances[1,2]*100,2),'%')) + #Extract x axis value from variance
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_filtered)$Temperature), max(sample_data(ps_filtered)$Temperature)), name = 'Temperature') +
ggtitle('CLR-Euclidean PCA') +
coord_fixed(ratio = 1) +
theme_bw()
pca_lograt_temp
ggsave("figures/pca_lograt_temp.eps",pca_lograt_temp, width = 7, height = 5, units = c("in"))
The CLR-Euclidean PCA reveals a separation among samples driven in part by salinity and temperature, except for samples N1 and N2 (all the way on the bottom left) which cluster differently. The samples from the same location and different size fractions are similar in most cases. (Eg. R1/R2 are close to G1/2, etc)
The compositional approach to ordinations that takes into account phylogenetic relatedness is PhILR, which uses the ILR transformation and calculates a weighted abundance (“balance”). It also scales the balances by phylogenetic distances, therefore taking into account phylogenetic relatedness (similar to Unifrac).
Based this on the Silverman et al. tutorial.
Prepare the data and load the philr package. (Note: I did not load this this earlier because there is an incompatibility with phyloseq and one of the philr dependencies and it spits out a warning every time I ran anything in phyloseq. I wanted to limit that printing out in the markdown file. The red text can be ignored).
# Make node labels in tree: n1, n2, etc.
phy_tree(ps_filtered) <- makeNodeLabel(phy_tree(ps_filtered), method="number", prefix='n')
# Load philr
library(philr)
# Name the balances (see Fig. 1 from Silverman et al. for depiction of "balance")
name.balance(phy_tree(ps_filtered), tax_table(ps_filtered), 'n1')
Extract the individual components from the phyloseq object and take a look
otu.table2 <- t(otu_table(ps_filtered))
tree2 <- phy_tree(ps_filtered)
metadata2 <- sample_data(ps_filtered)
tax2 <- tax_table(ps_filtered)
otu.table2[1:5,1:5] # ASV Table
tree2 # Phylogenetic Tree
head(metadata2,2) # Metadata
head(tax2,5) # taxonomy table
Next, pass the otu table, etc into PhILR, which will convert the tree to a sequential binary partition representation, calculate the weights of the taxa, build the contrast matrix, convert the OTU table to relative abundance (NOTE: you should pass absolute abundances into this function because it calculates the RA within it), transform using the phylogenetic-ILR transformation, and weigh by phylogenetic distances
ps.philr <- philr(otu.table2, tree2,
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt')
Check output
ps.philr[1:5,1:5]
Now the “OTU table” is presented with samples as rows and balances as columns. Each balance is associated with a node of the tree, n1, n2, etc.
Next do the ordination using a Euclidean distance matrix and check the screeplot
ps.philr.dist.euc <- dist(ps.philr, method="euclidean")
ps.philr.pcoa <- ordinate(ps_filtered, 'PCoA', distance=ps.philr.dist.euc)
# Extract variances from pcoa, from jaccard calculated dist. metric
philr_variances <- data.frame(ps.philr.pcoa$values$Relative_eig) %>%
select(PercVar = 'ps.philr.pcoa.values.Relative_eig') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(philr_variances)
# Make a screeplot
ggplot(philr_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "PhILR PCoA Screeplot")
Based on above, it’s fair to plot the first two axes. This would be 39.1+19.8 = 58.9% total variance
Plot the ordination using phyloseq syntax
# Plot with salinity
ps.philr.pcoa.plot.salinity <- plot_ordination(ps_filtered, ps.philr.pcoa, shape="Size_fraction", color='Salinity') + geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_filtered)$Salinity), max(sample_data(ps_filtered)$Salinity)), name = 'Salinity') +
geom_text(aes(label = X.SampleID), size = 3, hjust = 0, vjust = 0, , color = "black") +
ggtitle('PhILR PCoA Ordination') +
coord_fixed(ratio = 1) +
theme_bw()
ps.philr.pcoa.plot.salinity
ggsave("figures/ps.philr.pcoa.plot.salinity.eps",ps.philr.pcoa.plot.salinity, width = 7, height = 5, units = c("in"))
# Plot with Temperature
ps.philr.pcoa.plot.temp <- plot_ordination(ps_filtered, ps.philr.pcoa, shape="Size_fraction", color='Temperature') + geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_filtered)$Temperature), max(sample_data(ps_filtered)$Temperature)), name = 'Temperature') +
geom_text(aes(label = X.SampleID), size = 3, hjust = 0, vjust = 0, , color = "black") +
ggtitle('PhILR PCoA Ordination') +
coord_fixed(ratio = 1) +
theme_bw()
ps.philr.pcoa.plot.temp
ggsave("figures/ps.philr.pcoa.plot.temp.eps",ps.philr.pcoa.plot.temp, width = 7, height = 5, units = c("in"))
The plot above is very similar to the PCA with a minor difference: there is more distance separation among the PA/Fl samples along axis 1. Because this ordination is weighted by phylogenetic relatedness, this means that differences between PA and FL are driven by abundances of taxa that are phylogenetically distinct (ie. not closely related)
The more traditional approach to ordinations is to do a PCoA on a distance matrix such as Bray-Curtis, Jaccard, or Unifrac. While these are not considered compositional approaches, when combined with pre-treatment (transformations) they become more appropriate. One such transformation that I will use here is the Hellinger transformation.
The different distance matrices also tell you a few different things about the dataset so I will run through this to try to see if I can tease those out.
Before calculating any distance matrix, do a transformation of the filtered count table. Hellinger transformation is the square root of the relative abundance, so calculate it based on the ps_ra_filtered object:
ps_hellinger <- transform_sample_counts(ps_ra_filtered, function(x){sqrt(x)})
First, Jaccard, which builds the distance matrix based on presence/absence between samples. It does not take into account relative abundance of the taxa. Therefore this functions well for determining differences driven by rare taxa, which are weighed the same as abundant taxa.
jac_dmat<-vegdist(t(otu_table(ps_hellinger)),method="jaccard") # Jaccard dist metric
pcoa_jac<-pcoa(jac_dmat) # perform PCoA
# Extract variances from pcoa, from jaccard calculated dist. metric
jac_variances <- data.frame(pcoa_jac$values$Relative_eig) %>%
select(PercVar = 'pcoa_jac.values.Relative_eig') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(jac_variances)
# Make a screeplot
ggplot(jac_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "Jaccard PCoA Screeplot")
The first two axes (22.5 + 17.1 = 39.6%) pretty good. But I am going to experiment and plot the first 3 axes since the 2nd and 3rd explain a similar amount of variance, 17.1 and 11.3% respectively
Plot in 3D with Plotly
# Extract variances from the jaccard pcoa
pcoa_jac_df <- data.frame(pcoa_jac$vectors) %>%
rownames_to_column(var = "#SampleID")
# Merge metadata into the pcoa data table
pcoa_jac_df <- left_join(pcoa_jac_df, metadata, by = "#SampleID")
head(pcoa_jac_df)
# Select eigenvalues from dataframe, round to 4 places and multiply by 100 for plotting. These will be the axes for the 3-D plot
eigenvalues<-round(jac_variances[,2], digits = 4)*100
# Plotly - 3-D
pcoa_jaccard <- plot_ly(pcoa_jac_df, type='scatter3d', mode='markers',
x=~Axis.2,y=~Axis.3,z=~Axis.1,colors=~brewer.pal(11,'Spectral'),
color=~Salinity, symbols = c('circle','diamond-open'), symbol=~Size_fraction)%>%
layout(font=list(size=12),
title='PCoA Jaccard Distance',
scene=list(xaxis=list(title=paste0('Co 2 ',eigenvalues[2],'%'),
showticklabels=FALSE,zerolinecolor='black'),
yaxis=list(title=paste0('Co 3 ',eigenvalues[3],'%'),
showticklabels=FALSE,zerolinecolor='black'),
zaxis=list(title=paste0('Co 1 ',eigenvalues[1],'%'),
showticklabels=FALSE,zerolinecolor='black')))
pcoa_jaccard
withr::with_dir('Figures', htmlwidgets::saveWidget(as_widget(pcoa_jaccard), file="pcoa_jaccard.html"))
The Jaccard-PCoA is still driven by differences in salinity/temperature. Similar to the PCA and PhILR results, there are a couple of samples (green diamonds) that are clustering away from that pattern along PC1. There is some slight separation of Total/Pfree pairs in most cases.
Next, try a Bray-Curtis distance matrix with PCoA, which builds the distance matrix based on presence/absence between samples and relative abundance differences. This ordination will represent well the differences in samples that are driven by taxa with high relative abundances.
bray_dmat<-vegdist(t(otu_table(ps_hellinger)),method="bray") # Bray-Curtis dist metric
pcoa_bray<-pcoa(bray_dmat) # perform PCoA
# Extract variances from pcoa, from jaccard calculated dist. metric
bray_variances <- data.frame(pcoa_bray$values$Relative_eig) %>%
select(PercVar = 'pcoa_bray.values.Relative_eig') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(bray_variances)
# Make a screeplot
ggplot(bray_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "Bray-Curtis PCoA Screeplot")
The first two axes (32.6 + 21.9 = 54.5%) are pretty good again but I am still going to experiment in the plot with the 3rd axis (12.6% variance)
Plot in 3D with Plotly
# Extract variances from the jaccard pcoa
pcoa_bray_df <- data.frame(pcoa_bray$vectors) %>%
rownames_to_column(var = "#SampleID")
# Merge metadata into the pcoa data table
pcoa_bray_df <- left_join(pcoa_bray_df, metadata, by = "#SampleID")
head(pcoa_bray_df)
# Select eigenvalues from dataframe, round to 4 places and multiply by 100 for plotting. These will be the axes for the 3-D plot
eigenvalues<-round(bray_variances[,2], digits = 4)*100
# Plotly - 3-D
pcoa_bray <- plot_ly(pcoa_bray_df, type='scatter3d', mode='markers',
x=~Axis.2,y=~Axis.3,z=~Axis.1,colors=~brewer.pal(11,'Spectral'),
color=~Salinity, symbols = c('circle','diamond-open'), symbol=~Size_fraction)%>%
layout(font=list(size=12),
title='PCoA Bray-Curtis Distance',
scene=list(xaxis=list(title=paste0('Co 2 ',eigenvalues[2],'%'),
showticklabels=FALSE,zerolinecolor='black'),
yaxis=list(title=paste0('Co 3 ',eigenvalues[3],'%'),
showticklabels=FALSE,zerolinecolor='black'),
zaxis=list(title=paste0('Co 1 ',eigenvalues[1],'%'),
showticklabels=FALSE,zerolinecolor='black')))
pcoa_bray
withr::with_dir('Figures', htmlwidgets::saveWidget(as_widget(pcoa_bray), file="pcoa_bray.html"))
These results are almost identical to those from the Jaccard, so the influence of abundant and rare taxa on differences among samples seems to be the same. This is maybe not surprising since I did filter out many of the very low abundant taxa in order to do the ordinations.
Next try a Weighted Unifrac distance matrix with PCoA. Unifrac prioritizes differences in relatedness (of taxa) among sample. Using weighted unifrac means we are also taking into account changes in relative abundance between samples.
This should be done with the phyloseq function, ordinate, which has a unifrac distance method. vegdist does not.
out.wuf.log <- ordinate(ps_hellinger, method = "PCoA", distance = "wunifrac")
evals <- out.wuf.log$values$Eigenvalues
# Extract variances from pcoa, from jaccard calculated dist. metric
wuf_variances <- data.frame(out.wuf.log$values$Relative_eig) %>%
select(PercVar = 'out.wuf.log.values.Relative_eig') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(wuf_variances)
# Make a screeplot
ggplot(wuf_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "Weighted Unifrac PCoA Screeplot")
The first two axes (31.3 + 25.7 = 57%) explain a lot of variance so this can be plotted with the first 2 axes. There are negative eigenvalues towards the end of this screeplot. But since these eigenvalues are small relative to the primary axes, they can be ignored.
Visualize in 2D
pcoa_wuf <- plot_ordination(ps_hellinger, out.wuf.log, shape="Size_fraction", color = "Salinity") +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_hellinger)$Salinity), max(sample_data(ps_hellinger)$Salinity)), name = 'Salinity') +
ggtitle('Weighted Unifrac PCoA Ordination') +
coord_fixed(ratio = 1) +
theme_bw()
pcoa_wuf
ggsave("figures/pcoa_wuf.eps",pcoa_wuf, width = 7, height = 5, units = c("in"))
There are very similar patterns here compared to the PCA and PhILR plots. I wonder if there is difference among Pfree/Total if we use an unweighted unifrac.
Next try a Unweighted Unifrac distance matrix with PCoA. Unifrac prioritizes differences in relatedness (of taxa) among sample. Using unweighted unifrac means we are weighing all taxa the same, no matter their relative abundance.
out.uf.log <- ordinate(ps_hellinger, method = "PCoA", distance = "unifrac")
evals <- out.uf.log$values$Eigenvalues
# Extract variances from pcoa, from jaccard calculated dist. metric
uf_variances <- data.frame(out.uf.log$values$Relative_eig) %>%
select(PercVar = 'out.uf.log.values.Relative_eig') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(uf_variances)
# Make a screeplot
ggplot(uf_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "Unweighted Unifrac PCoA Screeplot")
The first two axes (23.7 + 16.3 = 40%) explain most of variance on their own (3rd axis is only 7.7%). Still, the first two axes are lower than previous ordinations so interpret with caution. Plot in a 2D plot.
Visualize in 2D
pcoa_uf <- plot_ordination(ps_hellinger, out.uf.log, shape="Size_fraction", color = "Salinity") +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_hellinger)$Salinity), max(sample_data(ps_hellinger)$Salinity)), name = 'Salinity') +
ggtitle('Unweighted Unifrac PCoA Ordination') +
coord_fixed(ratio = 1) +
theme_bw()
pcoa_uf
ggsave("figures/pcoa_uf.eps",pcoa_uf, width = 7, height = 5, units = c("in"))
When you compare the unweighted to the weighted unifrac PCoA, there is some more distance between the Pfree vs Total samples in the UF, indicating that the differences between these samples is driven by taxa that are phylogenetically different, whether they are abundant or rare.
Lastly, try a non-metric dimensional scaling ordination. PCA/PCoA are metric and attempt to rotate axes to fit the distance matrix distribution. An NMDS represents the data in 2-axes, by constraining the distribution of the points. Similar to above, this can be combined with different pre-treatment of the data.
First try the compositional approach, an NMDS on CLR-tranformed data using the Euclidean distances (aka Aitchison distance)
euc_dmat<-dist(clr_asv_table_ps_filtered, method = "euclidean") # Build the Aitchison distance matrix
euc_nmds <- metaMDS(euc_dmat, k=2, autotransform=FALSE) # Run the ordination
euc_nmds$stress #Check the stress. Less than 0.1 is good. Less than 0.5 is better. This will be different each time, since it is iteratively finding a unique solution each time (although the should look similar)
# Extract points from nmds and merge into data frame with metadata
euc_nmds_df <- data.frame(euc_nmds$points) %>%
rownames_to_column(var = "#SampleID")
# Merge metadata into the pcoa data table
euc_nmds_df <- left_join(euc_nmds_df, metadata, by = "#SampleID")
head(euc_nmds_df)
## Plotting euclidean distance NMDS
nmds_aitch <- ggplot(euc_nmds_df,aes(x = MDS1, y = MDS2, color = Salinity, shape = Size_fraction)) +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_filtered)$Salinity), max(sample_data(ps_filtered)$Salinity)), name = 'Salinity') +
theme_bw() +
labs(x = "NMDS 1", y = "NMDS 2", title = paste0('Aitchison Distance NMDS, Stress = ', round(euc_nmds$stress,2))) +
coord_fixed(ratio = 1)
nmds_aitch
ggsave("figures/nmds_aitch.eps",nmds_aitch, width = 7, height = 5, units = c("in"))
The above has a relatively high stress (>0.1) so should be interpreted with caution. There is some separation according to salinity. There are several ‘Total’ samples that separate from the rest that do not conform to the salinity pattern.
Next try a Jaccard NMDS, which will represent differences in presence/absence among samples, emphasizing both abundant and rare taxa the same
jac_nmds <- metaMDS(jac_dmat, k=2, autotransform=FALSE) # Run the ordination. Distance matrix was already calculated above
jac_nmds$stress #Check the stress. Less than 0.1 is good. Less than 0.5 is better. This will be different each time, since it is iteratively finding a unique solution each time (although the should look similar)
# Extract points from nmds and merge into data frame with metadata
jac_nmds_df <- data.frame(jac_nmds$points) %>%
rownames_to_column(var = "#SampleID")
# Merge metadata into the pcoa data table
jac_nmds_df <- left_join(jac_nmds_df, metadata, by = "#SampleID")
head(jac_nmds_df)
## Plotting euclidean distance NMDS
nmds_jaccard <- ggplot(jac_nmds_df,aes(x = MDS1, y = MDS2, color = Salinity, shape = Size_fraction)) +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_hellinger)$Salinity), max(sample_data(ps_hellinger)$Salinity)), name = 'Salinity') +
theme_bw() +
labs(x = "NMDS 1", y = "NMDS 2", title = paste0('Jaccard Distance NMDS, Stress = ', round(jac_nmds$stress,2))) +
coord_fixed(ratio = 1)
nmds_jaccard
ggsave("figures/nmds_jaccard.eps",nmds_jaccard, width = 7, height = 5, units = c("in"))
This is still a relatively high stress (>0.1) so should be interpreted with caution. The pattern along salinity gradient is similar here. There is some separation of Total/Pfree.
Next try a Bray-Curis NMDS, which will represent differences in presence/absence among samples and relative abundance, thus emphasizing impacts of highly abundant taxa.
bray_nmds <- metaMDS(bray_dmat, k=2, autotransform=FALSE) # Run the ordination. Distance matrix was already calculated above
bray_nmds$stress #Check the stress. Less than 0.1 is good. Less than 0.5 is better. This will be different each time, since it is iteratively finding a unique solution each time (although the should look similar)
# Extract points from nmds and merge into data frame with metadata
bray_nmds_df <- data.frame(bray_nmds$points) %>%
rownames_to_column(var = "#SampleID")
# Merge metadata into the pcoa data table
bray_nmds_df <- left_join(bray_nmds_df, metadata, by = "#SampleID")
head(bray_nmds_df)
## Plotting euclidean distance NMDS
nmds_bray <- ggplot(bray_nmds_df,aes(x = MDS1, y = MDS2, color = Salinity, shape = Size_fraction)) +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_hellinger)$Salinity), max(sample_data(ps_hellinger)$Salinity)), name = 'Salinity') +
theme_bw() +
labs(x = "NMDS 1", y = "NMDS 2", title = paste0('Bray-Curtis Distance NMDS, Stress = ', round(bray_nmds$stress,2))) +
coord_fixed(ratio = 1)
nmds_bray
ggsave("figures/nmds_bray.eps",nmds_bray, width = 7, height = 5, units = c("in"))
Very similar to Jaccard results. High-ish stress (0.14) The pattern along salinity gradient is same. There is some separation of Total/Pfree.
Next try a Unweighted Unifrac NMDS, which will represent differences in phylogenetic relatedness of taxa among samples. This is not weighted by relative abundance, thus emphasizing impacts of both abundance and rare taxa.
# Calculate the ordination using phyloseq function, ordinate (vegan does not have method Unifrac)
out.nmds.uf.log <- ordinate(ps_hellinger, method = "NMDS", distance = "unifrac")
evals <- out.nmds.uf.log$values$Eigenvalues
nmds_uf_plot = plot_ordination(ps_hellinger, out.nmds.uf.log, shape="Size_fraction", color = "Salinity") +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_hellinger)$Salinity), max(sample_data(ps_hellinger)$Salinity)), name = 'Salinity') +
ggtitle(paste0('Unweighted Unifrac NMDS, Stress = ', round(out.nmds.uf.log$stress,2))) +
coord_fixed(ratio = 1) +
theme_bw()
nmds_uf_plot
ggsave("figures/nmds_uf_plot.eps",nmds_uf_plot, width = 7, height = 5, units = c("in"))
Again, the stress here (0.17) is a bit too high to say anything confidently. There seems to be some pattern with salinity and some more dramatic separation of Total/Pfree.
Next a Weighted Unifrac NMDS, which will represent differences in phylogenetic relatedness of taxa among samples and is weighted by relative abundance, thus emphasizing impacts of abundant
# Calculate the ordination using phyloseq function, ordinate (vegan does not have method Unifrac)
out.nmds.wuf.log <- ordinate(ps_hellinger, method = "NMDS", distance = "wunifrac")
evals <- out.nmds.wuf.log$values$Eigenvalues
nmds_wuf_plot = plot_ordination(ps_hellinger, out.nmds.wuf.log, shape="Size_fraction", color = "Salinity") +
geom_point(size = 4) +
scale_color_distiller(palette = "Spectral", direction = 1, limits = c(min(sample_data(ps_hellinger)$Salinity), max(sample_data(ps_hellinger)$Salinity)), name = 'Salinity') +
ggtitle(paste0('Weighted Unifrac NMDS, Stress = ', round(out.nmds.wuf.log$stress,2))) +
coord_fixed(ratio = 1) +
theme_bw()
nmds_wuf_plot
ggsave("figures/nmds_wuf_plot.eps",nmds_wuf_plot, width = 7, height = 5, units = c("in"))
Again, the stress here (0.16) is too high to say anything confidently. There seems to be no/ weak pattern with salinity. The Total/Pfree points are closer together than in the unweighted unifrac, suggesting that differences in Pfree and Total were driven moreso by rare taxa.
Overall, NMDS does not seem like a good approach. The stresses are too high.
I think the compositional approaches (PCA and PhILR) tell an interesting story: Salinity and temperature have an effect on the composition of these samples. When you account for phylognetic relatedness (with PhILR), the most variance is explained in two axes (almost 60%). The PhILR compared to the PCA also demonstrates that the PA and FL samples are composed of phylogenetically distinct taxa.
I am interested in pulling out which taxa define the different in PA and FL.
The PhILR package has some interesting capabilities to do this using sparse logistic regressions.
Load glmnet and fit sparse logistic regression model (Note, if deriving a variable based on another with more than 2 factors, you need some extra lines. See tutorial linked above for details)
library(glmnet)
glmmod <- glmnet(ps.philr, sample_data(ps)$Size_fraction, alpha=1, family="binomial")
Identify the balances that define Total vs. Pfree. Use the default parameters from the tutorial for now.
top.coords <- as.matrix(coefficients(glmmod, s=0.2526))
top.coords <- rownames(top.coords)[which(top.coords != 0)]
(top.coords <- top.coords[2:length(top.coords)]) # remove the intercept as a coordinate
Visualize the balance values with respect to Total/ Pfree for each of these 3 balances
ps.philr.long <- convert_to_long(ps.philr, get_variable(ps, 'Size_fraction')) %>%
filter(coord %in% top.coords)
ggplot(ps.philr.long, aes(x=labels, y=value)) +
geom_boxplot(fill='lightgrey') +
facet_grid(.~coord, scales='free_x') +
xlab('Size fraction') + ylab('Balance Value') +
theme_bw()
There are 3 balances which represent the differences. Find out what those are:
tc.names <- sapply(top.coords, function(x) name.balance(tree2, tax2, x))
tc.names
These balance names represent the highest node on the tree, which can include many taxa below it (which is why, for example, n5 is comparing Bacteria and Bacteria- there are different taxa under each of these nodes).
Check full taxonomy of each of the 3 balances- First n5:
votesn5 <- name.balance(tree2, tax2, 'n5', return.votes = c('up', 'down'))
votesn5[[c('up.votes', 'phylum')]] # Numerator at Class Level
votesn5[[c('down.votes', 'phylum')]] # Denomenator
And visualize the position of the n5 node on a tree:
library(ggtree)
tc.nn <- name.to.nn(tree2, top.coords)
tc.colors <- c('#a6cee3')
p <- ggtree(tree2, layout='fan') +
geom_balance(node=tc.nn[1], fill=tc.colors[1], alpha=0.6)
p <- annotate_balance(tree2, 'n5', p=p, labels = c('n5+', 'n5-'),
offset.text=0.15, bar=FALSE)
p
The above shows the split between the numerator of the balance (n5+: Cyanobacteria, Firmicutes, Fusobacteriota) and the denomenator (n5-: Acidobacteriota, Actinobacteriota, Armatimonadota, Bacteroidota, Chloroflexi, Cloacimonadota, Crenarchaeota, Gemmatimonadota, Marinimicrobia (SAR406 clade), Nitrospinota, Nitrospirota, Patescibacteria, Planctomycetota, SAR324 clade(Marine group B) and Verrucomicrobiota).
I am not positive what is represented by the numerator and denomenator here (Total or Pfree), although I can guess that n5+ [numerator] is Total because a) that is the first factor that appears in ps$Size_fraction and b) there are less phyla (3 in n5+ vs. 15 in n5-) and it is a smaller chunk on the tree. I will check the relative abundance of some of these phyla, facet-wrapped by size fraction, to confirm which balances are in higher [relative] abundance in Total vs. Pfree:
First make a function that plots pyloseq data as bars but doesn’t include those black lines (from here )
#
my_plot_bar = function (physeq, x = "Sample", y = "Abundance", fill = NULL, title = NULL,
facet_grid = NULL) {
mdf = psmelt(physeq)
p = ggplot(mdf, aes_string(x = x, y = y, fill = fill))
p = p + geom_bar(stat = "identity", position = "stack")
p = p + theme(axis.text.x = element_text(angle = -90, hjust = 0))
if (!is.null(facet_grid)) {
p <- p + facet_grid(facet_grid)
}
if (!is.null(title)) {
p <- p + ggtitle(title)
}
return(p)
}
# Check cyanobacteria, which is in the numerator for balance n5
ps_cyano_ra <- subset_taxa(ps_ra, phylum == "Cyanobacteria")
ps_cyano_barplot <- my_plot_bar(ps_cyano_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_cyano_barplot
These seem more abundant in the Total size fraction, which would be consistent with numerator being the “Total” fraction.
# Check Firmicutes, which is in the numerator for balance n5
ps_firmi_ra <- subset_taxa(ps_ra, phylum == "Firmicutes")
ps_firmi_barplot <- my_plot_bar(ps_firmi_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_firmi_barplot
Firmicutes reach higher abundances in Total size fraction, but on average I wouldn’t necessarily say they are different between size fractions (but remember this plot is looking at relative abundance, not composition, like PhILR)
# Check Fusobacteriota, which is in the numerator for balance n5
ps_fuso_ra <- subset_taxa(ps_ra, phylum == "Fusobacteriota")
ps_fuso_barplot <- my_plot_bar(ps_fuso_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_fuso_barplot
These also seem to be in higher abundance in the total, so I am beginning to decipher that the numerator of the balance is “Total” and the denomenator is “Pfree”.
Check some from the denomenator (which according to my logic should be in higher abundance in Pfree)
# Check Bacteroidota, which is in the denomenator for balance n5
ps_bacteroid_ra <- subset_taxa(ps_ra, phylum == "Bacteroidota")
ps_bacteroid_barplot <- my_plot_bar(ps_bacteroid_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_bacteroid_barplot
These relative abundances seem similar in both size fractions, but perhaps a little higher on average in the Pfree
# Check Verrucomicrobiota, which is in the denomenator for balance n5
ps_verruco_ra <- subset_taxa(ps_ra, phylum == "Verrucomicrobiota")
ps_verruco_barplot <- my_plot_bar(ps_verruco_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_verruco_barplot
These also seem similar in both size fractions, not necessarily higher in Pfree. Could be because these are relative abdundances and PhILR is assessing composition (the ILR transformed abudnance data). Check one group that has only very low rel abundance overall…
# Check Nitrospinota, which is in the denomenator for balance n5 and has lower abundance overall, so might be more obvious
ps_nitrospino_ra <- subset_taxa(ps_ra, phylum == "Nitrospinota")
ps_nitrospino_barplot <- my_plot_bar(ps_nitrospino_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_nitrospino_barplot
This is definitely in higher relative abundance in the Total, not what I was expecting… This makes me a bit suspicious. Some of these mismatches in interpretation of PhILR results and the relative abundances can be explained by differences in transformed (compositional) data and relative abundance. But here, this balance is barely present in the Pfree fraction. But it is in the denomenator of this balance comparison, so I am not confident about this method yet.
Check full taxonomy of n29
votesn1114 <- name.balance(tree2, tax2, 'n29', return.votes = c('up', 'down'))
votesn1114[[c('up.votes', 'order')]] # Numerator at order Level
votesn1114[[c('down.votes', 'order')]] # Denomenator at order level
And visualize the position of the n29 node on a tree:
tc.nn <- name.to.nn(tree2, top.coords)
tc.colors <- c('#a6cee3')
p <- ggtree(tree2, layout='fan') +
geom_balance(node=tc.nn[2], fill=tc.colors[1], alpha=0.6)
p <- annotate_balance(tree2, 'n29', p=p, labels = c('n29+', 'n29-'),
offset.text=0.15, bar=FALSE)
p
This region is a very small part of the tree, within the Gammaproteobacteria. Hard to decipher from the full tree.
Check the relative abundance of some of the identified groups from balance n29 in Total vs. Pfree. First CCM19a, which is in the numerator:
ps_CCM19a_ra <- subset_taxa(ps_ra, order == "CCM19a")
ps_CCM19a_barplot <- my_plot_bar(ps_CCM19a_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_CCM19a_barplot
Definitely in higher abundance in Total and is present in numerator (n29+)
Check something from the denomenator, CCD24
ps_CCD24_ra <- subset_taxa(ps_ra, order == "CCD24")
ps_CCD24_barplot <- my_plot_bar(ps_CCD24_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_CCD24_barplot
So while the abundance of CCD24 is higher in Pfree than CCM19a is, CCD24 is still in higher relative abundance in Total than Pfree. I think there might be a relative abudnance bias against finding the Pfree- favored groups, that is being oversome by the PhILR method, which is why I keep seeing this. Try one more from the denomenator,EPR3968-O8a-Bc78
ps_EPR3968_ra <- subset_taxa(ps_ra, order == "EPR3968-O8a-Bc78")
ps_EPR3968_barplot <- my_plot_bar(ps_EPR3968_ra) +
facet_wrap(~Size_fraction, scales = "free_x")
ps_EPR3968_barplot
Here, the relative abundance definitely seems higher in Pfree, which is consistent with this group being in the denomenator.
ALDEx2 is a compositional approach to differential abundance that is similar to ANOVA. It models the probability of observing a count as a log-ratio transformed probability distribution rather than as direct counts, therefore correcting for differences in sequencing depth and being insensitive to the removal of ASVs from a sample.
library(ALDEx2)
The ALDEx t-test can be run in one step but is doing a lot of things. Here’s a quick summary from Nicholas Olberding’s website:
aldex2_ps <- ALDEx2::aldex(data.frame(phyloseq::otu_table(ps)), phyloseq::sample_data(ps)$Size_fraction, test="t", effect = TRUE, denom="iqlr")
Plot effect size
#Plot effect sizes
ALDEx2::aldex.plot(aldex2_ps, type="MW", test="wilcox", called.cex = 1, cutoff = 0.05)
Differentially abundant taxa are those where the difference exceeds dispersion. Points in the top are more abundant in the Total samples and towards the bottom are more abudnance in the Pfree samples. Highlighted in red are taxa with BH-FDR corrected p-values.
However, the authors of ANDEx2 recommend using effect size rather than p-values:
# Modify taxonomy table
ps_taxonomy <- tax_table(ps) %>%
as("matrix") %>%
tibble::as_tibble(rownames = "ASV")
#Clean up presentation of table
sig_aldex2 <- aldex2_ps %>%
rownames_to_column(var = "ASV") %>%
filter(wi.eBH < 0.05) %>%
arrange(effect, wi.eBH) %>%
dplyr::select(ASV, diff.btw, diff.win, effect, wi.ep, wi.eBH)
# Match ALDEx table to taxonmy table and view
sig_aldex2 <- left_join(sig_aldex2, ps_taxonomy, by = "ASV")
sig_aldex2
Table is arranged by diff.btw and effect. diff.btw is the median difference in clr between the Total and Pfree groups. Effect is the the diff.btw/ max(diff.win) for all instances. According to the authors, effect size is the key metric here: it is robust for a variety of distributions and tells us which ASVs are reproducibly different between groups. It is also
The taxa with negative effect values are those with higher abundance in Pfree and those with positive effect values are those in higher abundance in Total (based on plot above). There is some inconsistency here with the PhILR results-
Negative values (Pfree): Several ASVs from Bacteroidota, several from Gammaproteobacteria (Burkholderiales), 2 SAR11 clades, 1 Actinobacteria. Positive values (Total): there are many but here is a brief summary Several different Alpha and Gammaproteobacteria, including the EPR3968-O8a-Bc78 and Steroidobacterales ASVs that came up before, several Chloroplast ASVs, several Desulfobacterota, several Bacteroidota, several Chloroflexi, several Nitrospirota, several Verrucomicrobiota, and more…
While the chloroplasts seem to be consistenly in the Total size fraction for both the PhILR and ANDEx2 approaches (which makes sense, since these are eukaryotic), other groups like Nitrospirota, Verrucomicrobiota, Gammaproteobacteria etc. are in the Total group by ALDEx method but in the Pfree according to the PhILR method. I think the ALDEx2 method is more widely accepted for differential abundance analysis of microbiome data. Still, I should check at least one more approach to be confident…
Above step takes a bit (20-30 mins) so save variables again up to this point incase
Since that step takes a long time, save all variables up to this point in environmnet as RData object
# save.image("HREmicrobiome_analysis_vars_upto_aldex2.RData")
If variables are lost, come back and load everything up to now:
# load("/Volumes/MyPassport/HREMicrobiomeproject2019_LargeFiles/dada2_decipher_analysis_files/post_analysis_R/HREmicrobiome_analysis_vars_upto_aldex2.RData")
Note on the above: Copied the Rdata object into my external storage. Load it from there.
Following Mandal et al. 2015 and Kaul et al. 2017. ANCOM is an Aitchison log-ratio approach to differential abundance. ANCOM-II is the same approach as ANCOM, with a pre-processing step that corrects for excessive zeroes (which most microbiome data have). Here I am following the tutorial from here, specifically the moving_pics.R tutorial in the scripts directory.
Clone the ANCOM repository (in terminal)
git clone https://github.com/FrederickHuangLin/ANCOM.git
library(exactRankTests)
library(nlme)
source("ANCOM/scripts/ancom_v2.1.R")
Read in data and do some pre-processing for correct format. Based on Kaul et al. 2017, the types of zeroes we would have in this dataset should be considered structural zeroes.
feature_table <- as.data.frame(otu_table(ps)) # absolute abundance table; taxa as rows
meta_data <- as.data.frame(sample_data(ps)) # meta data table
colnames(meta_data)[1] <- "Sample.ID" # simplfy column name of sample IDs from meta_data
sample_var = "Sample.ID"; # indicate this name in a variable
group_var = NULL # required for detecting structural zeroes and outliers.
out_cut = 0.05; # observations with proportion of mixture distribution less than out_cut will be detected as outlier zeros
zero_cut = 0.90; # Taxa with proportion of zeroes greater than zero_cut are not included in the analysis
lib_cut = 1000; # Samples with library size less than lib_cut are not included in the analysis
neg_lb = TRUE; # TRUE indicates a taxon would be classified as a structural zero in the corresponding experimental group using its asymptotic lower bound. More specifically, neg_lb = TRUE indicates you are using both criteria stated in section 3.2 of ANCOM-II to detect structural zeros
Pre-processing- to detect structural zeroes
prepro = feature_table_pre_process(feature_table, meta_data, sample_var, group_var,
out_cut, zero_cut, lib_cut, neg_lb)
feature_table = prepro$feature_table # Preprocessed feature table
meta_data = prepro$meta_data # Preprocessed metadata
struc_zero = prepro$structure_zeros # Structural zero info
ANCOM step- takes a while so comment out for now
# main_var = "Size_fraction";
# p_adj_method = "BH"; # default method is Benjamini-Hochberg procedure
# alpha = 0.05 # significance cutoff
# adj_formula = NULL; rand_formula = NULL
# t_start = Sys.time()
# res = ANCOM(feature_table, meta_data, struc_zero, main_var, p_adj_method,
# alpha, adj_formula, rand_formula)
# t_end = Sys.time()
# t_run = t_end - t_start # around 30s
#
# write_csv(res$out, "ancom_results.csv")
t_run
Above step took >24 hours so save variables again up to this point incase
Since that step takes a long time, save all variables up to this point in environmnet as RData object
save.image("HREmicrobiome_analysis_vars_upto_ancom.RData")
If variables are lost, come back and load everything up to now:
# load("/Volumes/MyPassport/HREMicrobiomeproject2019_LargeFiles/dada2_decipher_analysis_files/post_analysis_R/HREmicrobiome_analysis_vars_upto_ancom.RData")
Note on the above: Copied the Rdata object into my external storage. Load it from there.
Plot- FIGURE THIS OUT
# Step 3: Volcano Plot
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(struc_zero), nrow(feature_table), sum(apply(struc_zero, 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = c(0.9 * (n_taxa -1), 0.8 * (n_taxa -1), 0.7 * (n_taxa -1), 0.6 * (n_taxa -1))
names(cut_off) = c("detected_0.9", "detected_0.8", "detected_0.7", "detected_0.6")
# Annotation data
dat_ann = data.frame(x = min(res$fig$data$x), y = cut_off["detected_0.9"], label = "W[0.9]")
fig = res$fig +
geom_hline(yintercept = cut_off["detected_0.9"], linetype = "dashed") +
geom_text(data = dat_ann, aes(x = x, y = y, label = label),
size = 4, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE)
fig
#ggsave("images/moving_pics.jpeg", height=5, width=6.25, units='in', dpi = 300)